Diffstat (limited to 'sys/cddl/contrib/opensolaris/uts')
275 files changed, 25 insertions, 184409 deletions
diff --git a/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c b/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c index 8399be770bb0..3d68a68ba819 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c +++ b/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c @@ -67,13 +67,15 @@ * on capital-f functions. */ #include <sys/errno.h> +#include <sys/param.h> +#include <sys/types.h> #ifndef illumos #include <sys/time.h> #endif #include <sys/stat.h> -#include <sys/modctl.h> #include <sys/conf.h> #include <sys/systm.h> +#include <sys/endian.h> #ifdef illumos #include <sys/ddi.h> #include <sys/sunddi.h> @@ -96,7 +98,6 @@ #include <sys/panic.h> #include <sys/priv_impl.h> #endif -#include <sys/policy.h> #ifdef illumos #include <sys/cred_impl.h> #include <sys/procfs_isa.h> @@ -119,6 +120,7 @@ #include <sys/limits.h> #include <sys/linker.h> #include <sys/kdb.h> +#include <sys/jail.h> #include <sys/kernel.h> #include <sys/malloc.h> #include <sys/lock.h> @@ -129,6 +131,13 @@ #include <sys/sx.h> #include <sys/sysctl.h> + +#include <sys/mount.h> +#undef AT_UID +#undef AT_GID +#include <sys/vnode.h> +#include <sys/cred.h> + #include <sys/dtrace_bsd.h> #include <netinet/in.h> @@ -299,8 +308,10 @@ static kmutex_t dtrace_meta_lock; /* meta-provider state lock */ #define ipaddr_t in_addr_t #define mod_modname pathname #define vuprintf vprintf +#ifndef crgetzoneid +#define crgetzoneid(_a) 0 +#endif #define ttoproc(_a) ((_a)->td_proc) -#define crgetzoneid(_a) 0 #define SNOCD 0 #define CPU_ON_INTR(_a) 0 @@ -491,7 +502,7 @@ do { \ if ((remp) != NULL) { \ *(remp) = (uintptr_t)(baseaddr) + (basesz) - (addr); \ } \ -_NOTE(CONSTCOND) } while (0) +} while (0) /* diff --git a/sys/cddl/contrib/opensolaris/uts/common/dtrace/fasttrap.c b/sys/cddl/contrib/opensolaris/uts/common/dtrace/fasttrap.c index d5be43f0c3d1..4771a67a9f09 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/dtrace/fasttrap.c +++ b/sys/cddl/contrib/opensolaris/uts/common/dtrace/fasttrap.c @@ -35,6 +35,7 @@ #include <sys/atomic.h> #include <sys/errno.h> #include <sys/stat.h> +#include <sys/endian.h> #include <sys/modctl.h> #include <sys/conf.h> #include <sys/systm.h> @@ -54,6 +55,8 @@ #include <sys/dtrace_impl.h> #include <sys/sysmacros.h> #include <sys/proc.h> +#undef AT_UID +#undef AT_GID #include <sys/policy.h> #ifdef illumos #include <util/qsort.h> diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c deleted file mode 100644 index 6d82470d220a..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c +++ /dev/null @@ -1,94 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. 
- */ - -/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ -/* All Rights Reserved */ - -/* - * University Copyright- Copyright (c) 1982, 1986, 1988 - * The Regents of the University of California - * All Rights Reserved - * - * University Acknowledgment- Portions of this document are derived from - * software developed by the University of California, Berkeley, and its - * contributors. - */ - -#include <sys/types.h> -#include <sys/param.h> -#include <sys/proc.h> -#include <sys/taskq.h> -#include <sys/vnode.h> - -/* Extensible attribute (xva) routines. */ - -/* - * Zero out the structure, set the size of the requested/returned bitmaps, - * set AT_XVATTR in the embedded vattr_t's va_mask, and set up the pointer - * to the returned attributes array. - */ -void -xva_init(xvattr_t *xvap) -{ - bzero(xvap, sizeof (xvattr_t)); - xvap->xva_mapsize = XVA_MAPSIZE; - xvap->xva_magic = XVA_MAGIC; - xvap->xva_vattr.va_mask = AT_XVATTR; - xvap->xva_rtnattrmapp = &(xvap->xva_rtnattrmap)[0]; -} - -/* - * If AT_XVATTR is set, returns a pointer to the embedded xoptattr_t - * structure. Otherwise, returns NULL. - */ -xoptattr_t * -xva_getxoptattr(xvattr_t *xvap) -{ - xoptattr_t *xoap = NULL; - if (xvap->xva_vattr.va_mask & AT_XVATTR) - xoap = &xvap->xva_xoptattrs; - return (xoap); -} - -/* - * Like vn_rele() except if we are going to call VOP_INACTIVE() then do it - * asynchronously using a taskq. This can avoid deadlocks caused by re-entering - * the file system as a result of releasing the vnode. Note, file systems - * already have to handle the race where the vnode is incremented before the - * inactive routine is called and does its locking. - * - * Warning: Excessive use of this routine can lead to performance problems. - * This is because taskqs throttle back allocation if too many are created. - */ -void -vn_rele_async(vnode_t *vp, taskq_t *taskq) -{ - VERIFY(vp->v_count > 0); - if (refcount_release_if_not_last(&vp->v_usecount)) { - return; - } - VERIFY(taskq_dispatch((taskq_t *)taskq, - (task_func_t *)vrele, vp, TQ_SLEEP) != 0); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.cityhash b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.cityhash deleted file mode 100644 index e558b2a50358..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.cityhash +++ /dev/null @@ -1,19 +0,0 @@ -Copyright (c) 2011 Google, Inc. - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. 
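For context, a minimal illustrative sketch (not part of the diff) of how the xvattr helpers from the deleted vnode.c above, xva_init() and xva_getxoptattr(), were typically called. XVA_SET_REQ(), XAT_READONLY and the xoa_readonly field are assumed here from the OpenSolaris xvattr API rather than taken from this commit.

/*
 * Illustrative sketch only, not part of the diff.  Shows the usual calling
 * pattern for the xvattr helpers defined in the removed vnode.c.
 * XVA_SET_REQ(), XAT_READONLY and xoa_readonly are assumed from the
 * OpenSolaris xvattr API.
 */
#include <sys/types.h>
#include <sys/errno.h>
#include <sys/vnode.h>

static int
example_query_readonly(vnode_t *vp, boolean_t *ronlyp)
{
	xvattr_t xva;
	xoptattr_t *xoap;

	xva_init(&xva);				/* zero xva, set AT_XVATTR in va_mask */
	XVA_SET_REQ(&xva, XAT_READONLY);	/* request one optional attribute */

	/* ... hand &xva.xva_vattr to the filesystem's getattr entry point ... */

	/* xva_getxoptattr() returns non-NULL only if AT_XVATTR is still set. */
	if ((xoap = xva_getxoptattr(&xva)) == NULL)
		return (ENOTSUP);
	*ronlyp = xoap->xoa_readonly;
	return (0);
}

vn_rele_async() from the same deleted file follows a similar compatibility pattern: the caller supplies a taskq so that the final vrele(), and any VOP_INACTIVE() it triggers, runs asynchronously rather than re-entering the filesystem on the caller's stack.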
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.cityhash.descrip b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.cityhash.descrip deleted file mode 100644 index f98cb76dfc91..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.cityhash.descrip +++ /dev/null @@ -1 +0,0 @@ -CITYHASH CHECKSUM FUNCTIONALITY IN ZFS diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.lz4 b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.lz4 deleted file mode 100644 index 722cc75f01e9..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.lz4 +++ /dev/null @@ -1,30 +0,0 @@ -LZ4 - Fast LZ compression algorithm -Copyright (C) 2011-2013, Yann Collet. -BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS -IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED -TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER -OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -You can contact the author at : -- LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html -- LZ4 source repository : http://code.google.com/p/lz4/ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.lz4.descrip b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.lz4.descrip deleted file mode 100644 index 211f679b5749..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.lz4.descrip +++ /dev/null @@ -1 +0,0 @@ -LZ4 COMPRESSION FUNCTIONALITY IN ZFS diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/abd.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/abd.c deleted file mode 100644 index 1843c8161038..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/abd.c +++ /dev/null @@ -1,960 +0,0 @@ -/* - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - */ - -/* - * Copyright (c) 2014 by Chunwei Chen. All rights reserved. - * Copyright (c) 2016 by Delphix. All rights reserved. - */ - -/* - * ARC buffer data (ABD). 
- * - * ABDs are an abstract data structure for the ARC which can use two - * different ways of storing the underlying data: - * - * (a) Linear buffer. In this case, all the data in the ABD is stored in one - * contiguous buffer in memory (from a zio_[data_]buf_* kmem cache). - * - * +-------------------+ - * | ABD (linear) | - * | abd_flags = ... | - * | abd_size = ... | +--------------------------------+ - * | abd_buf ------------->| raw buffer of size abd_size | - * +-------------------+ +--------------------------------+ - * no abd_chunks - * - * (b) Scattered buffer. In this case, the data in the ABD is split into - * equal-sized chunks (from the abd_chunk_cache kmem_cache), with pointers - * to the chunks recorded in an array at the end of the ABD structure. - * - * +-------------------+ - * | ABD (scattered) | - * | abd_flags = ... | - * | abd_size = ... | - * | abd_offset = 0 | +-----------+ - * | abd_chunks[0] ----------------------------->| chunk 0 | - * | abd_chunks[1] ---------------------+ +-----------+ - * | ... | | +-----------+ - * | abd_chunks[N-1] ---------+ +------->| chunk 1 | - * +-------------------+ | +-----------+ - * | ... - * | +-----------+ - * +----------------->| chunk N-1 | - * +-----------+ - * - * Using a large proportion of scattered ABDs decreases ARC fragmentation since - * when we are at the limit of allocatable space, using equal-size chunks will - * allow us to quickly reclaim enough space for a new large allocation (assuming - * it is also scattered). - * - * In addition to directly allocating a linear or scattered ABD, it is also - * possible to create an ABD by requesting the "sub-ABD" starting at an offset - * within an existing ABD. In linear buffers this is simple (set abd_buf of - * the new ABD to the starting point within the original raw buffer), but - * scattered ABDs are a little more complex. The new ABD makes a copy of the - * relevant abd_chunks pointers (but not the underlying data). However, to - * provide arbitrary rather than only chunk-aligned starting offsets, it also - * tracks an abd_offset field which represents the starting point of the data - * within the first chunk in abd_chunks. For both linear and scattered ABDs, - * creating an offset ABD marks the original ABD as the offset's parent, and the - * original ABD's abd_children refcount is incremented. This data allows us to - * ensure the root ABD isn't deleted before its children. - * - * Most consumers should never need to know what type of ABD they're using -- - * the ABD public API ensures that it's possible to transparently switch from - * using a linear ABD to a scattered one when doing so would be beneficial. - * - * If you need to use the data within an ABD directly, if you know it's linear - * (because you allocated it) you can use abd_to_buf() to access the underlying - * raw buffer. Otherwise, you should use one of the abd_borrow_buf* functions - * which will allocate a raw buffer if necessary. Use the abd_return_buf* - * functions to return any raw buffers that are no longer necessary when you're - * done using them. - * - * There are a variety of ABD APIs that implement basic buffer operations: - * compare, copy, read, write, and fill with zeroes. If you need a custom - * function which progressively accesses the whole ABD, use the abd_iterate_* - * functions. 
- */ - -#include <sys/abd.h> -#include <sys/param.h> -#include <sys/zio.h> -#include <sys/zfs_context.h> -#include <sys/zfs_znode.h> - -typedef struct abd_stats { - kstat_named_t abdstat_struct_size; - kstat_named_t abdstat_scatter_cnt; - kstat_named_t abdstat_scatter_data_size; - kstat_named_t abdstat_scatter_chunk_waste; - kstat_named_t abdstat_linear_cnt; - kstat_named_t abdstat_linear_data_size; -} abd_stats_t; - -static abd_stats_t abd_stats = { - /* Amount of memory occupied by all of the abd_t struct allocations */ - { "struct_size", KSTAT_DATA_UINT64 }, - /* - * The number of scatter ABDs which are currently allocated, excluding - * ABDs which don't own their data (for instance the ones which were - * allocated through abd_get_offset()). - */ - { "scatter_cnt", KSTAT_DATA_UINT64 }, - /* Amount of data stored in all scatter ABDs tracked by scatter_cnt */ - { "scatter_data_size", KSTAT_DATA_UINT64 }, - /* - * The amount of space wasted at the end of the last chunk across all - * scatter ABDs tracked by scatter_cnt. - */ - { "scatter_chunk_waste", KSTAT_DATA_UINT64 }, - /* - * The number of linear ABDs which are currently allocated, excluding - * ABDs which don't own their data (for instance the ones which were - * allocated through abd_get_offset() and abd_get_from_buf()). If an - * ABD takes ownership of its buf then it will become tracked. - */ - { "linear_cnt", KSTAT_DATA_UINT64 }, - /* Amount of data stored in all linear ABDs tracked by linear_cnt */ - { "linear_data_size", KSTAT_DATA_UINT64 }, -}; - -#define ABDSTAT(stat) (abd_stats.stat.value.ui64) -#define ABDSTAT_INCR(stat, val) \ - atomic_add_64(&abd_stats.stat.value.ui64, (val)) -#define ABDSTAT_BUMP(stat) ABDSTAT_INCR(stat, 1) -#define ABDSTAT_BUMPDOWN(stat) ABDSTAT_INCR(stat, -1) - -/* - * It is possible to make all future ABDs be linear by setting this to B_FALSE. - * Otherwise, ABDs are allocated scattered by default unless the caller uses - * abd_alloc_linear(). - */ -boolean_t zfs_abd_scatter_enabled = B_TRUE; - -/* - * The size of the chunks ABD allocates. Because the sizes allocated from the - * kmem_cache can't change, this tunable can only be modified at boot. Changing - * it at runtime would cause ABD iteration to work incorrectly for ABDs which - * were allocated with the old size, so a safeguard has been put in place which - * will cause the machine to panic if you change it and try to access the data - * within a scattered ABD. 
- */ -size_t zfs_abd_chunk_size = 4096; - -#if defined(__FreeBSD__) && defined(_KERNEL) -SYSCTL_DECL(_vfs_zfs); - -SYSCTL_INT(_vfs_zfs, OID_AUTO, abd_scatter_enabled, CTLFLAG_RWTUN, - &zfs_abd_scatter_enabled, 0, "Enable scattered ARC data buffers"); -SYSCTL_ULONG(_vfs_zfs, OID_AUTO, abd_chunk_size, CTLFLAG_RDTUN, - &zfs_abd_chunk_size, 0, "The size of the chunks ABD allocates"); -#endif - -#ifdef _KERNEL -extern vmem_t *zio_alloc_arena; -#endif - -kmem_cache_t *abd_chunk_cache; -static kstat_t *abd_ksp; - -extern inline boolean_t abd_is_linear(abd_t *abd); -extern inline void abd_copy(abd_t *dabd, abd_t *sabd, size_t size); -extern inline void abd_copy_from_buf(abd_t *abd, const void *buf, size_t size); -extern inline void abd_copy_to_buf(void* buf, abd_t *abd, size_t size); -extern inline int abd_cmp_buf(abd_t *abd, const void *buf, size_t size); -extern inline void abd_zero(abd_t *abd, size_t size); - -static void * -abd_alloc_chunk() -{ - void *c = kmem_cache_alloc(abd_chunk_cache, KM_PUSHPAGE); - ASSERT3P(c, !=, NULL); - return (c); -} - -static void -abd_free_chunk(void *c) -{ - kmem_cache_free(abd_chunk_cache, c); -} - -void -abd_init(void) -{ -#ifdef illumos - vmem_t *data_alloc_arena = NULL; - -#ifdef _KERNEL - data_alloc_arena = zio_alloc_arena; -#endif - - /* - * Since ABD chunks do not appear in crash dumps, we pass KMC_NOTOUCH - * so that no allocator metadata is stored with the buffers. - */ - abd_chunk_cache = kmem_cache_create("abd_chunk", zfs_abd_chunk_size, 0, - NULL, NULL, NULL, NULL, data_alloc_arena, KMC_NOTOUCH); -#else - abd_chunk_cache = kmem_cache_create("abd_chunk", zfs_abd_chunk_size, 0, - NULL, NULL, NULL, NULL, 0, KMC_NOTOUCH | KMC_NODEBUG); -#endif - abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED, - sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); - if (abd_ksp != NULL) { - abd_ksp->ks_data = &abd_stats; - kstat_install(abd_ksp); - } -} - -void -abd_fini(void) -{ - if (abd_ksp != NULL) { - kstat_delete(abd_ksp); - abd_ksp = NULL; - } - - kmem_cache_destroy(abd_chunk_cache); - abd_chunk_cache = NULL; -} - -static inline size_t -abd_chunkcnt_for_bytes(size_t size) -{ - return (P2ROUNDUP(size, zfs_abd_chunk_size) / zfs_abd_chunk_size); -} - -static inline size_t -abd_scatter_chunkcnt(abd_t *abd) -{ - ASSERT(!abd_is_linear(abd)); - return (abd_chunkcnt_for_bytes( - abd->abd_u.abd_scatter.abd_offset + abd->abd_size)); -} - -static inline void -abd_verify(abd_t *abd) -{ - ASSERT3U(abd->abd_size, >, 0); - ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE); - ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR | - ABD_FLAG_OWNER | ABD_FLAG_META)); - IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER)); - IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER); - if (abd_is_linear(abd)) { - ASSERT3P(abd->abd_u.abd_linear.abd_buf, !=, NULL); - } else { - ASSERT3U(abd->abd_u.abd_scatter.abd_offset, <, - zfs_abd_chunk_size); - size_t n = abd_scatter_chunkcnt(abd); - for (int i = 0; i < n; i++) { - ASSERT3P( - abd->abd_u.abd_scatter.abd_chunks[i], !=, NULL); - } - } -} - -static inline abd_t * -abd_alloc_struct(size_t chunkcnt) -{ - size_t size = offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]); - abd_t *abd = kmem_alloc(size, KM_PUSHPAGE); - ASSERT3P(abd, !=, NULL); - ABDSTAT_INCR(abdstat_struct_size, size); - - return (abd); -} - -static inline void -abd_free_struct(abd_t *abd) -{ - size_t chunkcnt = abd_is_linear(abd) ? 
0 : abd_scatter_chunkcnt(abd); - int size = offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]); - kmem_free(abd, size); - ABDSTAT_INCR(abdstat_struct_size, -size); -} - -/* - * Allocate an ABD, along with its own underlying data buffers. Use this if you - * don't care whether the ABD is linear or not. - */ -abd_t * -abd_alloc(size_t size, boolean_t is_metadata) -{ - if (!zfs_abd_scatter_enabled || size <= zfs_abd_chunk_size) - return (abd_alloc_linear(size, is_metadata)); - - VERIFY3U(size, <=, SPA_MAXBLOCKSIZE); - - size_t n = abd_chunkcnt_for_bytes(size); - abd_t *abd = abd_alloc_struct(n); - - abd->abd_flags = ABD_FLAG_OWNER; - if (is_metadata) { - abd->abd_flags |= ABD_FLAG_META; - } - abd->abd_size = size; - abd->abd_parent = NULL; - zfs_refcount_create(&abd->abd_children); - - abd->abd_u.abd_scatter.abd_offset = 0; - abd->abd_u.abd_scatter.abd_chunk_size = zfs_abd_chunk_size; - - for (int i = 0; i < n; i++) { - void *c = abd_alloc_chunk(); - ASSERT3P(c, !=, NULL); - abd->abd_u.abd_scatter.abd_chunks[i] = c; - } - - ABDSTAT_BUMP(abdstat_scatter_cnt); - ABDSTAT_INCR(abdstat_scatter_data_size, size); - ABDSTAT_INCR(abdstat_scatter_chunk_waste, - n * zfs_abd_chunk_size - size); - - return (abd); -} - -static void -abd_free_scatter(abd_t *abd) -{ - size_t n = abd_scatter_chunkcnt(abd); - for (int i = 0; i < n; i++) { - abd_free_chunk(abd->abd_u.abd_scatter.abd_chunks[i]); - } - - zfs_refcount_destroy(&abd->abd_children); - ABDSTAT_BUMPDOWN(abdstat_scatter_cnt); - ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size); - ABDSTAT_INCR(abdstat_scatter_chunk_waste, - abd->abd_size - n * zfs_abd_chunk_size); - - abd_free_struct(abd); -} - -/* - * Allocate an ABD that must be linear, along with its own underlying data - * buffer. Only use this when it would be very annoying to write your ABD - * consumer with a scattered ABD. - */ -abd_t * -abd_alloc_linear(size_t size, boolean_t is_metadata) -{ - abd_t *abd = abd_alloc_struct(0); - - VERIFY3U(size, <=, SPA_MAXBLOCKSIZE); - - abd->abd_flags = ABD_FLAG_LINEAR | ABD_FLAG_OWNER; - if (is_metadata) { - abd->abd_flags |= ABD_FLAG_META; - } - abd->abd_size = size; - abd->abd_parent = NULL; - zfs_refcount_create(&abd->abd_children); - - if (is_metadata) { - abd->abd_u.abd_linear.abd_buf = zio_buf_alloc(size); - } else { - abd->abd_u.abd_linear.abd_buf = zio_data_buf_alloc(size); - } - - ABDSTAT_BUMP(abdstat_linear_cnt); - ABDSTAT_INCR(abdstat_linear_data_size, size); - - return (abd); -} - -static void -abd_free_linear(abd_t *abd) -{ - if (abd->abd_flags & ABD_FLAG_META) { - zio_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size); - } else { - zio_data_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size); - } - - zfs_refcount_destroy(&abd->abd_children); - ABDSTAT_BUMPDOWN(abdstat_linear_cnt); - ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size); - - abd_free_struct(abd); -} - -/* - * Free an ABD. Only use this on ABDs allocated with abd_alloc() or - * abd_alloc_linear(). - */ -void -abd_free(abd_t *abd) -{ - abd_verify(abd); - ASSERT3P(abd->abd_parent, ==, NULL); - ASSERT(abd->abd_flags & ABD_FLAG_OWNER); - if (abd_is_linear(abd)) - abd_free_linear(abd); - else - abd_free_scatter(abd); -} - -/* - * Allocate an ABD of the same format (same metadata flag, same scatterize - * setting) as another ABD. 
- */ -abd_t * -abd_alloc_sametype(abd_t *sabd, size_t size) -{ - boolean_t is_metadata = (sabd->abd_flags & ABD_FLAG_META) != 0; - if (abd_is_linear(sabd)) { - return (abd_alloc_linear(size, is_metadata)); - } else { - return (abd_alloc(size, is_metadata)); - } -} - -/* - * If we're going to use this ABD for doing I/O using the block layer, the - * consumer of the ABD data doesn't care if it's scattered or not, and we don't - * plan to store this ABD in memory for a long period of time, we should - * allocate the ABD type that requires the least data copying to do the I/O. - * - * Currently this is linear ABDs, however if ldi_strategy() can ever issue I/Os - * using a scatter/gather list we should switch to that and replace this call - * with vanilla abd_alloc(). - */ -abd_t * -abd_alloc_for_io(size_t size, boolean_t is_metadata) -{ - return (abd_alloc_linear(size, is_metadata)); -} - -/* - * Allocate a new ABD to point to offset off of sabd. It shares the underlying - * buffer data with sabd. Use abd_put() to free. sabd must not be freed while - * any derived ABDs exist. - */ -abd_t * -abd_get_offset(abd_t *sabd, size_t off) -{ - abd_t *abd; - - abd_verify(sabd); - ASSERT3U(off, <=, sabd->abd_size); - - if (abd_is_linear(sabd)) { - abd = abd_alloc_struct(0); - - /* - * Even if this buf is filesystem metadata, we only track that - * if we own the underlying data buffer, which is not true in - * this case. Therefore, we don't ever use ABD_FLAG_META here. - */ - abd->abd_flags = ABD_FLAG_LINEAR; - - abd->abd_u.abd_linear.abd_buf = - (char *)sabd->abd_u.abd_linear.abd_buf + off; - } else { - size_t new_offset = sabd->abd_u.abd_scatter.abd_offset + off; - size_t chunkcnt = abd_scatter_chunkcnt(sabd) - - (new_offset / zfs_abd_chunk_size); - - abd = abd_alloc_struct(chunkcnt); - - /* - * Even if this buf is filesystem metadata, we only track that - * if we own the underlying data buffer, which is not true in - * this case. Therefore, we don't ever use ABD_FLAG_META here. - */ - abd->abd_flags = 0; - - abd->abd_u.abd_scatter.abd_offset = - new_offset % zfs_abd_chunk_size; - abd->abd_u.abd_scatter.abd_chunk_size = zfs_abd_chunk_size; - - /* Copy the scatterlist starting at the correct offset */ - (void) memcpy(&abd->abd_u.abd_scatter.abd_chunks, - &sabd->abd_u.abd_scatter.abd_chunks[new_offset / - zfs_abd_chunk_size], - chunkcnt * sizeof (void *)); - } - - abd->abd_size = sabd->abd_size - off; - abd->abd_parent = sabd; - zfs_refcount_create(&abd->abd_children); - (void) zfs_refcount_add_many(&sabd->abd_children, abd->abd_size, abd); - - return (abd); -} - -/* - * Allocate a linear ABD structure for buf. You must free this with abd_put() - * since the resulting ABD doesn't own its own buffer. - */ -abd_t * -abd_get_from_buf(void *buf, size_t size) -{ - abd_t *abd = abd_alloc_struct(0); - - VERIFY3U(size, <=, SPA_MAXBLOCKSIZE); - - /* - * Even if this buf is filesystem metadata, we only track that if we - * own the underlying data buffer, which is not true in this case. - * Therefore, we don't ever use ABD_FLAG_META here. - */ - abd->abd_flags = ABD_FLAG_LINEAR; - abd->abd_size = size; - abd->abd_parent = NULL; - zfs_refcount_create(&abd->abd_children); - - abd->abd_u.abd_linear.abd_buf = buf; - - return (abd); -} - -/* - * Free an ABD allocated from abd_get_offset() or abd_get_from_buf(). Will not - * free the underlying scatterlist or buffer. 
- */ -void -abd_put(abd_t *abd) -{ - abd_verify(abd); - ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER)); - - if (abd->abd_parent != NULL) { - (void) zfs_refcount_remove_many(&abd->abd_parent->abd_children, - abd->abd_size, abd); - } - - zfs_refcount_destroy(&abd->abd_children); - abd_free_struct(abd); -} - -/* - * Get the raw buffer associated with a linear ABD. - */ -void * -abd_to_buf(abd_t *abd) -{ - ASSERT(abd_is_linear(abd)); - abd_verify(abd); - return (abd->abd_u.abd_linear.abd_buf); -} - -/* - * Borrow a raw buffer from an ABD without copying the contents of the ABD - * into the buffer. If the ABD is scattered, this will allocate a raw buffer - * whose contents are undefined. To copy over the existing data in the ABD, use - * abd_borrow_buf_copy() instead. - */ -void * -abd_borrow_buf(abd_t *abd, size_t n) -{ - void *buf; - abd_verify(abd); - ASSERT3U(abd->abd_size, >=, n); - if (abd_is_linear(abd)) { - buf = abd_to_buf(abd); - } else { - buf = zio_buf_alloc(n); - } - (void) zfs_refcount_add_many(&abd->abd_children, n, buf); - - return (buf); -} - -void * -abd_borrow_buf_copy(abd_t *abd, size_t n) -{ - void *buf = abd_borrow_buf(abd, n); - if (!abd_is_linear(abd)) { - abd_copy_to_buf(buf, abd, n); - } - return (buf); -} - -/* - * Return a borrowed raw buffer to an ABD. If the ABD is scattered, this will - * not change the contents of the ABD and will ASSERT that you didn't modify - * the buffer since it was borrowed. If you want any changes you made to buf to - * be copied back to abd, use abd_return_buf_copy() instead. - */ -void -abd_return_buf(abd_t *abd, void *buf, size_t n) -{ - abd_verify(abd); - ASSERT3U(abd->abd_size, >=, n); - if (abd_is_linear(abd)) { - ASSERT3P(buf, ==, abd_to_buf(abd)); - } else { - ASSERT0(abd_cmp_buf(abd, buf, n)); - zio_buf_free(buf, n); - } - (void) zfs_refcount_remove_many(&abd->abd_children, n, buf); -} - -void -abd_return_buf_copy(abd_t *abd, void *buf, size_t n) -{ - if (!abd_is_linear(abd)) { - abd_copy_from_buf(abd, buf, n); - } - abd_return_buf(abd, buf, n); -} - -/* - * Give this ABD ownership of the buffer that it's storing. Can only be used on - * linear ABDs which were allocated via abd_get_from_buf(), or ones allocated - * with abd_alloc_linear() which subsequently released ownership of their buf - * with abd_release_ownership_of_buf(). 
- */ -void -abd_take_ownership_of_buf(abd_t *abd, boolean_t is_metadata) -{ - ASSERT(abd_is_linear(abd)); - ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER)); - abd_verify(abd); - - abd->abd_flags |= ABD_FLAG_OWNER; - if (is_metadata) { - abd->abd_flags |= ABD_FLAG_META; - } - - ABDSTAT_BUMP(abdstat_linear_cnt); - ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size); -} - -void -abd_release_ownership_of_buf(abd_t *abd) -{ - ASSERT(abd_is_linear(abd)); - ASSERT(abd->abd_flags & ABD_FLAG_OWNER); - abd_verify(abd); - - abd->abd_flags &= ~ABD_FLAG_OWNER; - /* Disable this flag since we no longer own the data buffer */ - abd->abd_flags &= ~ABD_FLAG_META; - - ABDSTAT_BUMPDOWN(abdstat_linear_cnt); - ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size); -} - -struct abd_iter { - abd_t *iter_abd; /* ABD being iterated through */ - size_t iter_pos; /* position (relative to abd_offset) */ - void *iter_mapaddr; /* addr corresponding to iter_pos */ - size_t iter_mapsize; /* length of data valid at mapaddr */ -}; - -static inline size_t -abd_iter_scatter_chunk_offset(struct abd_iter *aiter) -{ - ASSERT(!abd_is_linear(aiter->iter_abd)); - return ((aiter->iter_abd->abd_u.abd_scatter.abd_offset + - aiter->iter_pos) % zfs_abd_chunk_size); -} - -static inline size_t -abd_iter_scatter_chunk_index(struct abd_iter *aiter) -{ - ASSERT(!abd_is_linear(aiter->iter_abd)); - return ((aiter->iter_abd->abd_u.abd_scatter.abd_offset + - aiter->iter_pos) / zfs_abd_chunk_size); -} - -/* - * Initialize the abd_iter. - */ -static void -abd_iter_init(struct abd_iter *aiter, abd_t *abd) -{ - abd_verify(abd); - aiter->iter_abd = abd; - aiter->iter_pos = 0; - aiter->iter_mapaddr = NULL; - aiter->iter_mapsize = 0; -} - -/* - * Advance the iterator by a certain amount. Cannot be called when a chunk is - * in use. This can be safely called when the aiter has already exhausted, in - * which case this does nothing. - */ -static void -abd_iter_advance(struct abd_iter *aiter, size_t amount) -{ - ASSERT3P(aiter->iter_mapaddr, ==, NULL); - ASSERT0(aiter->iter_mapsize); - - /* There's nothing left to advance to, so do nothing */ - if (aiter->iter_pos == aiter->iter_abd->abd_size) - return; - - aiter->iter_pos += amount; -} - -/* - * Map the current chunk into aiter. This can be safely called when the aiter - * has already exhausted, in which case this does nothing. - */ -static void -abd_iter_map(struct abd_iter *aiter) -{ - void *paddr; - size_t offset = 0; - - ASSERT3P(aiter->iter_mapaddr, ==, NULL); - ASSERT0(aiter->iter_mapsize); - - /* Panic if someone has changed zfs_abd_chunk_size */ - IMPLY(!abd_is_linear(aiter->iter_abd), zfs_abd_chunk_size == - aiter->iter_abd->abd_u.abd_scatter.abd_chunk_size); - - /* There's nothing left to iterate over, so do nothing */ - if (aiter->iter_pos == aiter->iter_abd->abd_size) - return; - - if (abd_is_linear(aiter->iter_abd)) { - offset = aiter->iter_pos; - aiter->iter_mapsize = aiter->iter_abd->abd_size - offset; - paddr = aiter->iter_abd->abd_u.abd_linear.abd_buf; - } else { - size_t index = abd_iter_scatter_chunk_index(aiter); - offset = abd_iter_scatter_chunk_offset(aiter); - aiter->iter_mapsize = zfs_abd_chunk_size - offset; - paddr = aiter->iter_abd->abd_u.abd_scatter.abd_chunks[index]; - } - aiter->iter_mapaddr = (char *)paddr + offset; -} - -/* - * Unmap the current chunk from aiter. This can be safely called when the aiter - * has already exhausted, in which case this does nothing. 
- */ -static void -abd_iter_unmap(struct abd_iter *aiter) -{ - /* There's nothing left to unmap, so do nothing */ - if (aiter->iter_pos == aiter->iter_abd->abd_size) - return; - - ASSERT3P(aiter->iter_mapaddr, !=, NULL); - ASSERT3U(aiter->iter_mapsize, >, 0); - - aiter->iter_mapaddr = NULL; - aiter->iter_mapsize = 0; -} - -int -abd_iterate_func(abd_t *abd, size_t off, size_t size, - abd_iter_func_t *func, void *private) -{ - int ret = 0; - struct abd_iter aiter; - - abd_verify(abd); - ASSERT3U(off + size, <=, abd->abd_size); - - abd_iter_init(&aiter, abd); - abd_iter_advance(&aiter, off); - - while (size > 0) { - abd_iter_map(&aiter); - - size_t len = MIN(aiter.iter_mapsize, size); - ASSERT3U(len, >, 0); - - ret = func(aiter.iter_mapaddr, len, private); - - abd_iter_unmap(&aiter); - - if (ret != 0) - break; - - size -= len; - abd_iter_advance(&aiter, len); - } - - return (ret); -} - -struct buf_arg { - void *arg_buf; -}; - -static int -abd_copy_to_buf_off_cb(void *buf, size_t size, void *private) -{ - struct buf_arg *ba_ptr = private; - - (void) memcpy(ba_ptr->arg_buf, buf, size); - ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size; - - return (0); -} - -/* - * Copy abd to buf. (off is the offset in abd.) - */ -void -abd_copy_to_buf_off(void *buf, abd_t *abd, size_t off, size_t size) -{ - struct buf_arg ba_ptr = { buf }; - - (void) abd_iterate_func(abd, off, size, abd_copy_to_buf_off_cb, - &ba_ptr); -} - -static int -abd_cmp_buf_off_cb(void *buf, size_t size, void *private) -{ - int ret; - struct buf_arg *ba_ptr = private; - - ret = memcmp(buf, ba_ptr->arg_buf, size); - ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size; - - return (ret); -} - -/* - * Compare the contents of abd to buf. (off is the offset in abd.) - */ -int -abd_cmp_buf_off(abd_t *abd, const void *buf, size_t off, size_t size) -{ - struct buf_arg ba_ptr = { (void *) buf }; - - return (abd_iterate_func(abd, off, size, abd_cmp_buf_off_cb, &ba_ptr)); -} - -static int -abd_copy_from_buf_off_cb(void *buf, size_t size, void *private) -{ - struct buf_arg *ba_ptr = private; - - (void) memcpy(buf, ba_ptr->arg_buf, size); - ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size; - - return (0); -} - -/* - * Copy from buf to abd. (off is the offset in abd.) - */ -void -abd_copy_from_buf_off(abd_t *abd, const void *buf, size_t off, size_t size) -{ - struct buf_arg ba_ptr = { (void *) buf }; - - (void) abd_iterate_func(abd, off, size, abd_copy_from_buf_off_cb, - &ba_ptr); -} - -/*ARGSUSED*/ -static int -abd_zero_off_cb(void *buf, size_t size, void *private) -{ - (void) memset(buf, 0, size); - return (0); -} - -/* - * Zero out the abd from a particular offset to the end. - */ -void -abd_zero_off(abd_t *abd, size_t off, size_t size) -{ - (void) abd_iterate_func(abd, off, size, abd_zero_off_cb, NULL); -} - -/* - * Iterate over two ABDs and call func incrementally on the two ABDs' data in - * equal-sized chunks (passed to func as raw buffers). func could be called many - * times during this iteration. 
- */ -int -abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, - size_t size, abd_iter_func2_t *func, void *private) -{ - int ret = 0; - struct abd_iter daiter, saiter; - - abd_verify(dabd); - abd_verify(sabd); - - ASSERT3U(doff + size, <=, dabd->abd_size); - ASSERT3U(soff + size, <=, sabd->abd_size); - - abd_iter_init(&daiter, dabd); - abd_iter_init(&saiter, sabd); - abd_iter_advance(&daiter, doff); - abd_iter_advance(&saiter, soff); - - while (size > 0) { - abd_iter_map(&daiter); - abd_iter_map(&saiter); - - size_t dlen = MIN(daiter.iter_mapsize, size); - size_t slen = MIN(saiter.iter_mapsize, size); - size_t len = MIN(dlen, slen); - ASSERT(dlen > 0 || slen > 0); - - ret = func(daiter.iter_mapaddr, saiter.iter_mapaddr, len, - private); - - abd_iter_unmap(&saiter); - abd_iter_unmap(&daiter); - - if (ret != 0) - break; - - size -= len; - abd_iter_advance(&daiter, len); - abd_iter_advance(&saiter, len); - } - - return (ret); -} - -/*ARGSUSED*/ -static int -abd_copy_off_cb(void *dbuf, void *sbuf, size_t size, void *private) -{ - (void) memcpy(dbuf, sbuf, size); - return (0); -} - -/* - * Copy from sabd to dabd starting from soff and doff. - */ -void -abd_copy_off(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, size_t size) -{ - (void) abd_iterate_func2(dabd, sabd, doff, soff, size, - abd_copy_off_cb, NULL); -} - -/*ARGSUSED*/ -static int -abd_cmp_cb(void *bufa, void *bufb, size_t size, void *private) -{ - return (memcmp(bufa, bufb, size)); -} - -/* - * Compares the first size bytes of two ABDs. - */ -int -abd_cmp(abd_t *dabd, abd_t *sabd, size_t size) -{ - return (abd_iterate_func2(dabd, sabd, 0, 0, size, abd_cmp_cb, NULL)); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/aggsum.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/aggsum.c deleted file mode 100644 index 713ff2b0116c..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/aggsum.c +++ /dev/null @@ -1,234 +0,0 @@ -/* - * CDDL HEADER START - * - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2017, 2018 by Delphix. All rights reserved. - */ - -#include <sys/zfs_context.h> -#include <sys/aggsum.h> - -/* - * Aggregate-sum counters are a form of fanned-out counter, used when atomic - * instructions on a single field cause enough CPU cache line contention to - * slow system performance. Due to their increased overhead and the expense - * involved with precisely reading from them, they should only be used in cases - * where the write rate (increment/decrement) is much higher than the read rate - * (get value). - * - * Aggregate sum counters are comprised of two basic parts, the core and the - * buckets. The core counter contains a lock for the entire counter, as well - * as the current upper and lower bounds on the value of the counter. The - * aggsum_bucket structure contains a per-bucket lock to protect the contents of - * the bucket, the current amount that this bucket has changed from the global - * counter (called the delta), and the amount of increment and decrement we have - * "borrowed" from the core counter. - * - * The basic operation of an aggsum is simple. 
Threads that wish to modify the - * counter will modify one bucket's counter (determined by their current CPU, to - * help minimize lock and cache contention). If the bucket already has - * sufficient capacity borrowed from the core structure to handle their request, - * they simply modify the delta and return. If the bucket does not, we clear - * the bucket's current state (to prevent the borrowed amounts from getting too - * large), and borrow more from the core counter. Borrowing is done by adding to - * the upper bound (or subtracting from the lower bound) of the core counter, - * and setting the borrow value for the bucket to the amount added (or - * subtracted). Clearing the bucket is the opposite; we add the current delta - * to both the lower and upper bounds of the core counter, subtract the borrowed - * incremental from the upper bound, and add the borrowed decrement from the - * lower bound. Note that only borrowing and clearing require access to the - * core counter; since all other operations access CPU-local resources, - * performance can be much higher than a traditional counter. - * - * Threads that wish to read from the counter have a slightly more challenging - * task. It is fast to determine the upper and lower bounds of the aggum; this - * does not require grabbing any locks. This suffices for cases where an - * approximation of the aggsum's value is acceptable. However, if one needs to - * know whether some specific value is above or below the current value in the - * aggsum, they invoke aggsum_compare(). This function operates by repeatedly - * comparing the target value to the upper and lower bounds of the aggsum, and - * then clearing a bucket. This proceeds until the target is outside of the - * upper and lower bounds and we return a response, or the last bucket has been - * cleared and we know that the target is equal to the aggsum's value. Finally, - * the most expensive operation is determining the precise value of the aggsum. - * To do this, we clear every bucket and then return the upper bound (which must - * be equal to the lower bound). What makes aggsum_compare() and aggsum_value() - * expensive is clearing buckets. This involves grabbing the global lock - * (serializing against themselves and borrow operations), grabbing a bucket's - * lock (preventing threads on those CPUs from modifying their delta), and - * zeroing out the borrowed value (forcing that thread to borrow on its next - * request, which will also be expensive). This is what makes aggsums well - * suited for write-many read-rarely operations. - */ - -/* - * We will borrow aggsum_borrow_multiplier times the current request, so we will - * have to get the as_lock approximately every aggsum_borrow_multiplier calls to - * aggsum_delta(). 
- */ -static uint_t aggsum_borrow_multiplier = 10; - -void -aggsum_init(aggsum_t *as, uint64_t value) -{ - bzero(as, sizeof (*as)); - as->as_lower_bound = as->as_upper_bound = value; - mutex_init(&as->as_lock, NULL, MUTEX_DEFAULT, NULL); - as->as_numbuckets = boot_ncpus; - as->as_buckets = kmem_zalloc(boot_ncpus * sizeof (aggsum_bucket_t), - KM_SLEEP); - for (int i = 0; i < as->as_numbuckets; i++) { - mutex_init(&as->as_buckets[i].asc_lock, - NULL, MUTEX_DEFAULT, NULL); - } -} - -void -aggsum_fini(aggsum_t *as) -{ - for (int i = 0; i < as->as_numbuckets; i++) - mutex_destroy(&as->as_buckets[i].asc_lock); - kmem_free(as->as_buckets, as->as_numbuckets * sizeof (aggsum_bucket_t)); - mutex_destroy(&as->as_lock); -} - -int64_t -aggsum_lower_bound(aggsum_t *as) -{ - return (as->as_lower_bound); -} - -int64_t -aggsum_upper_bound(aggsum_t *as) -{ - return (as->as_upper_bound); -} - -static void -aggsum_flush_bucket(aggsum_t *as, struct aggsum_bucket *asb) -{ - ASSERT(MUTEX_HELD(&as->as_lock)); - ASSERT(MUTEX_HELD(&asb->asc_lock)); - - /* - * We use atomic instructions for this because we read the upper and - * lower bounds without the lock, so we need stores to be atomic. - */ - atomic_add_64((volatile uint64_t *)&as->as_lower_bound, - asb->asc_delta + asb->asc_borrowed); - atomic_add_64((volatile uint64_t *)&as->as_upper_bound, - asb->asc_delta - asb->asc_borrowed); - asb->asc_delta = 0; - asb->asc_borrowed = 0; -} - -uint64_t -aggsum_value(aggsum_t *as) -{ - int64_t rv; - - mutex_enter(&as->as_lock); - if (as->as_lower_bound == as->as_upper_bound) { - rv = as->as_lower_bound; - for (int i = 0; i < as->as_numbuckets; i++) { - ASSERT0(as->as_buckets[i].asc_delta); - ASSERT0(as->as_buckets[i].asc_borrowed); - } - mutex_exit(&as->as_lock); - return (rv); - } - for (int i = 0; i < as->as_numbuckets; i++) { - struct aggsum_bucket *asb = &as->as_buckets[i]; - mutex_enter(&asb->asc_lock); - aggsum_flush_bucket(as, asb); - mutex_exit(&asb->asc_lock); - } - VERIFY3U(as->as_lower_bound, ==, as->as_upper_bound); - rv = as->as_lower_bound; - mutex_exit(&as->as_lock); - - return (rv); -} - -void -aggsum_add(aggsum_t *as, int64_t delta) -{ - struct aggsum_bucket *asb = - &as->as_buckets[CPU_SEQID % as->as_numbuckets]; - int64_t borrow; - - /* Try fast path if we already borrowed enough before. */ - mutex_enter(&asb->asc_lock); - if (asb->asc_delta + delta <= (int64_t)asb->asc_borrowed && - asb->asc_delta + delta >= -(int64_t)asb->asc_borrowed) { - asb->asc_delta += delta; - mutex_exit(&asb->asc_lock); - return; - } - mutex_exit(&asb->asc_lock); - - /* - * We haven't borrowed enough. Take the global lock and borrow - * considering what is requested now and what we borrowed before. - */ - borrow = (delta < 0 ? -delta : delta) * aggsum_borrow_multiplier; - mutex_enter(&as->as_lock); - mutex_enter(&asb->asc_lock); - delta += asb->asc_delta; - asb->asc_delta = 0; - if (borrow >= asb->asc_borrowed) - borrow -= asb->asc_borrowed; - else - borrow = (borrow - (int64_t)asb->asc_borrowed) / 4; - asb->asc_borrowed += borrow; - atomic_add_64((volatile uint64_t *)&as->as_lower_bound, - delta - borrow); - atomic_add_64((volatile uint64_t *)&as->as_upper_bound, - delta + borrow); - mutex_exit(&asb->asc_lock); - mutex_exit(&as->as_lock); -} - -/* - * Compare the aggsum value to target efficiently. Returns -1 if the value - * represented by the aggsum is less than target, 1 if it's greater, and 0 if - * they are equal. 
- */ -int -aggsum_compare(aggsum_t *as, uint64_t target) -{ - if (as->as_upper_bound < target) - return (-1); - if (as->as_lower_bound > target) - return (1); - mutex_enter(&as->as_lock); - for (int i = 0; i < as->as_numbuckets; i++) { - struct aggsum_bucket *asb = &as->as_buckets[i]; - mutex_enter(&asb->asc_lock); - aggsum_flush_bucket(as, asb); - mutex_exit(&asb->asc_lock); - if (as->as_upper_bound < target) { - mutex_exit(&as->as_lock); - return (-1); - } - if (as->as_lower_bound > target) { - mutex_exit(&as->as_lock); - return (1); - } - } - VERIFY3U(as->as_lower_bound, ==, as->as_upper_bound); - ASSERT3U(as->as_lower_bound, ==, target); - mutex_exit(&as->as_lock); - return (0); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c deleted file mode 100644 index 592fb02cfac1..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c +++ /dev/null @@ -1,8569 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2018, Joyent, Inc. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. - * Copyright (c) 2014 by Saso Kiselkov. All rights reserved. - * Copyright 2017 Nexenta Systems, Inc. All rights reserved. - */ - -/* - * DVA-based Adjustable Replacement Cache - * - * While much of the theory of operation used here is - * based on the self-tuning, low overhead replacement cache - * presented by Megiddo and Modha at FAST 2003, there are some - * significant differences: - * - * 1. The Megiddo and Modha model assumes any page is evictable. - * Pages in its cache cannot be "locked" into memory. This makes - * the eviction algorithm simple: evict the last page in the list. - * This also make the performance characteristics easy to reason - * about. Our cache is not so simple. At any given moment, some - * subset of the blocks in the cache are un-evictable because we - * have handed out a reference to them. Blocks are only evictable - * when there are no external references active. This makes - * eviction far more problematic: we choose to evict the evictable - * blocks that are the "lowest" in the list. - * - * There are times when it is not possible to evict the requested - * space. In these circumstances we are unable to adjust the cache - * size. To prevent the cache growing unbounded at these times we - * implement a "cache throttle" that slows the flow of new data - * into the cache until we can make space available. - * - * 2. The Megiddo and Modha model assumes a fixed cache size. - * Pages are evicted when the cache is full and there is a cache - * miss. Our model has a variable sized cache. 
It grows with - * high use, but also tries to react to memory pressure from the - * operating system: decreasing its size when system memory is - * tight. - * - * 3. The Megiddo and Modha model assumes a fixed page size. All - * elements of the cache are therefore exactly the same size. So - * when adjusting the cache size following a cache miss, its simply - * a matter of choosing a single page to evict. In our model, we - * have variable sized cache blocks (rangeing from 512 bytes to - * 128K bytes). We therefore choose a set of blocks to evict to make - * space for a cache miss that approximates as closely as possible - * the space used by the new block. - * - * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache" - * by N. Megiddo & D. Modha, FAST 2003 - */ - -/* - * The locking model: - * - * A new reference to a cache buffer can be obtained in two - * ways: 1) via a hash table lookup using the DVA as a key, - * or 2) via one of the ARC lists. The arc_read() interface - * uses method 1, while the internal ARC algorithms for - * adjusting the cache use method 2. We therefore provide two - * types of locks: 1) the hash table lock array, and 2) the - * ARC list locks. - * - * Buffers do not have their own mutexes, rather they rely on the - * hash table mutexes for the bulk of their protection (i.e. most - * fields in the arc_buf_hdr_t are protected by these mutexes). - * - * buf_hash_find() returns the appropriate mutex (held) when it - * locates the requested buffer in the hash table. It returns - * NULL for the mutex if the buffer was not in the table. - * - * buf_hash_remove() expects the appropriate hash mutex to be - * already held before it is invoked. - * - * Each ARC state also has a mutex which is used to protect the - * buffer list associated with the state. When attempting to - * obtain a hash table lock while holding an ARC list lock you - * must use: mutex_tryenter() to avoid deadlock. Also note that - * the active state mutex must be held before the ghost state mutex. - * - * It as also possible to register a callback which is run when the - * arc_meta_limit is reached and no buffers can be safely evicted. In - * this case the arc user should drop a reference on some arc buffers so - * they can be reclaimed and the arc_meta_limit honored. For example, - * when using the ZPL each dentry holds a references on a znode. These - * dentries must be pruned before the arc buffer holding the znode can - * be safely evicted. - * - * Note that the majority of the performance stats are manipulated - * with atomic operations. - * - * The L2ARC uses the l2ad_mtx on each vdev for the following: - * - * - L2ARC buflist creation - * - L2ARC buflist eviction - * - L2ARC write completion, which walks L2ARC buflists - * - ARC header destruction, as it removes from L2ARC buflists - * - ARC header release, as it removes from L2ARC buflists - */ - -/* - * ARC operation: - * - * Every block that is in the ARC is tracked by an arc_buf_hdr_t structure. - * This structure can point either to a block that is still in the cache or to - * one that is only accessible in an L2 ARC device, or it can provide - * information about a block that was recently evicted. If a block is - * only accessible in the L2ARC, then the arc_buf_hdr_t only has enough - * information to retrieve it from the L2ARC device. This information is - * stored in the l2arc_buf_hdr_t sub-structure of the arc_buf_hdr_t. A block - * that is in this state cannot access the data directly. 
- * - * Blocks that are actively being referenced or have not been evicted - * are cached in the L1ARC. The L1ARC (l1arc_buf_hdr_t) is a structure within - * the arc_buf_hdr_t that will point to the data block in memory. A block can - * only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC - * caches data in two ways -- in a list of ARC buffers (arc_buf_t) and - * also in the arc_buf_hdr_t's private physical data block pointer (b_pabd). - * - * The L1ARC's data pointer may or may not be uncompressed. The ARC has the - * ability to store the physical data (b_pabd) associated with the DVA of the - * arc_buf_hdr_t. Since the b_pabd is a copy of the on-disk physical block, - * it will match its on-disk compression characteristics. This behavior can be - * disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the - * compressed ARC functionality is disabled, the b_pabd will point to an - * uncompressed version of the on-disk data. - * - * Data in the L1ARC is not accessed by consumers of the ARC directly. Each - * arc_buf_hdr_t can have multiple ARC buffers (arc_buf_t) which reference it. - * Each ARC buffer (arc_buf_t) is being actively accessed by a specific ARC - * consumer. The ARC will provide references to this data and will keep it - * cached until it is no longer in use. The ARC caches only the L1ARC's physical - * data block and will evict any arc_buf_t that is no longer referenced. The - * amount of memory consumed by the arc_buf_ts' data buffers can be seen via the - * "overhead_size" kstat. - * - * Depending on the consumer, an arc_buf_t can be requested in uncompressed or - * compressed form. The typical case is that consumers will want uncompressed - * data, and when that happens a new data buffer is allocated where the data is - * decompressed for them to use. Currently the only consumer who wants - * compressed arc_buf_t's is "zfs send", when it streams data exactly as it - * exists on disk. When this happens, the arc_buf_t's data buffer is shared - * with the arc_buf_hdr_t. - * - * Here is a diagram showing an arc_buf_hdr_t referenced by two arc_buf_t's. The - * first one is owned by a compressed send consumer (and therefore references - * the same compressed data buffer as the arc_buf_hdr_t) and the second could be - * used by any other consumer (and has its own uncompressed copy of the data - * buffer). - * - * arc_buf_hdr_t - * +-----------+ - * | fields | - * | common to | - * | L1- and | - * | L2ARC | - * +-----------+ - * | l2arc_buf_hdr_t - * | | - * +-----------+ - * | l1arc_buf_hdr_t - * | | arc_buf_t - * | b_buf +------------>+-----------+ arc_buf_t - * | b_pabd +-+ |b_next +---->+-----------+ - * +-----------+ | |-----------| |b_next +-->NULL - * | |b_comp = T | +-----------+ - * | |b_data +-+ |b_comp = F | - * | +-----------+ | |b_data +-+ - * +->+------+ | +-----------+ | - * compressed | | | | - * data | |<--------------+ | uncompressed - * +------+ compressed, | data - * shared +-->+------+ - * data | | - * | | - * +------+ - * - * When a consumer reads a block, the ARC must first look to see if the - * arc_buf_hdr_t is cached. If the hdr is cached then the ARC allocates a new - * arc_buf_t and either copies uncompressed data into a new data buffer from an - * existing uncompressed arc_buf_t, decompresses the hdr's b_pabd buffer into a - * new data buffer, or shares the hdr's b_pabd buffer, depending on whether the - * hdr is compressed and the desired compression characteristics of the - * arc_buf_t consumer. 
If the arc_buf_t ends up sharing data with the - * arc_buf_hdr_t and both of them are uncompressed then the arc_buf_t must be - * the last buffer in the hdr's b_buf list, however a shared compressed buf can - * be anywhere in the hdr's list. - * - * The diagram below shows an example of an uncompressed ARC hdr that is - * sharing its data with an arc_buf_t (note that the shared uncompressed buf is - * the last element in the buf list): - * - * arc_buf_hdr_t - * +-----------+ - * | | - * | | - * | | - * +-----------+ - * l2arc_buf_hdr_t| | - * | | - * +-----------+ - * l1arc_buf_hdr_t| | - * | | arc_buf_t (shared) - * | b_buf +------------>+---------+ arc_buf_t - * | | |b_next +---->+---------+ - * | b_pabd +-+ |---------| |b_next +-->NULL - * +-----------+ | | | +---------+ - * | |b_data +-+ | | - * | +---------+ | |b_data +-+ - * +->+------+ | +---------+ | - * | | | | - * uncompressed | | | | - * data +------+ | | - * ^ +->+------+ | - * | uncompressed | | | - * | data | | | - * | +------+ | - * +---------------------------------+ - * - * Writing to the ARC requires that the ARC first discard the hdr's b_pabd - * since the physical block is about to be rewritten. The new data contents - * will be contained in the arc_buf_t. As the I/O pipeline performs the write, - * it may compress the data before writing it to disk. The ARC will be called - * with the transformed data and will bcopy the transformed on-disk block into - * a newly allocated b_pabd. Writes are always done into buffers which have - * either been loaned (and hence are new and don't have other readers) or - * buffers which have been released (and hence have their own hdr, if there - * were originally other readers of the buf's original hdr). This ensures that - * the ARC only needs to update a single buf and its hdr after a write occurs. - * - * When the L2ARC is in use, it will also take advantage of the b_pabd. The - * L2ARC will always write the contents of b_pabd to the L2ARC. This means - * that when compressed ARC is enabled that the L2ARC blocks are identical - * to the on-disk block in the main data pool. This provides a significant - * advantage since the ARC can leverage the bp's checksum when reading from the - * L2ARC to determine if the contents are valid. However, if the compressed - * ARC is disabled, then the L2ARC's block must be transformed to look - * like the physical block in the main data pool before comparing the - * checksum and determining its validity. - */ - -#include <sys/spa.h> -#include <sys/zio.h> -#include <sys/spa_impl.h> -#include <sys/zio_compress.h> -#include <sys/zio_checksum.h> -#include <sys/zfs_context.h> -#include <sys/arc.h> -#include <sys/refcount.h> -#include <sys/vdev.h> -#include <sys/vdev_impl.h> -#include <sys/dsl_pool.h> -#include <sys/zio_checksum.h> -#include <sys/multilist.h> -#include <sys/abd.h> -#ifdef _KERNEL -#include <sys/dnlc.h> -#include <sys/racct.h> -#endif -#include <sys/callb.h> -#include <sys/kstat.h> -#include <sys/trim_map.h> -#include <sys/zthr.h> -#include <zfs_fletcher.h> -#include <sys/sdt.h> -#include <sys/aggsum.h> -#include <sys/cityhash.h> - -#include <machine/vmparam.h> - -#ifdef illumos -#ifndef _KERNEL -/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */ -boolean_t arc_watch = B_FALSE; -int arc_procfd; -#endif -#endif /* illumos */ - -/* - * This thread's job is to keep enough free memory in the system, by - * calling arc_kmem_reap_now() plus arc_shrink(), which improves - * arc_available_memory(). 
- */ -static zthr_t *arc_reap_zthr; - -/* - * This thread's job is to keep arc_size under arc_c, by calling - * arc_adjust(), which improves arc_is_overflowing(). - */ -static zthr_t *arc_adjust_zthr; - -static kmutex_t arc_adjust_lock; -static kcondvar_t arc_adjust_waiters_cv; -static boolean_t arc_adjust_needed = B_FALSE; - -static kmutex_t arc_dnlc_evicts_lock; -static kcondvar_t arc_dnlc_evicts_cv; -static boolean_t arc_dnlc_evicts_thread_exit; - -uint_t arc_reduce_dnlc_percent = 3; - -/* - * The number of headers to evict in arc_evict_state_impl() before - * dropping the sublist lock and evicting from another sublist. A lower - * value means we're more likely to evict the "correct" header (i.e. the - * oldest header in the arc state), but comes with higher overhead - * (i.e. more invocations of arc_evict_state_impl()). - */ -int zfs_arc_evict_batch_limit = 10; - -/* number of seconds before growing cache again */ -int arc_grow_retry = 60; - -/* - * Minimum time between calls to arc_kmem_reap_soon(). Note that this will - * be converted to ticks, so with the default hz=100, a setting of 15 ms - * will actually wait 2 ticks, or 20ms. - */ -int arc_kmem_cache_reap_retry_ms = 1000; - -/* shift of arc_c for calculating overflow limit in arc_get_data_impl */ -int zfs_arc_overflow_shift = 8; - -/* shift of arc_c for calculating both min and max arc_p */ -int arc_p_min_shift = 4; - -/* log2(fraction of arc to reclaim) */ -int arc_shrink_shift = 7; - -/* - * log2(fraction of ARC which must be free to allow growing). - * I.e. If there is less than arc_c >> arc_no_grow_shift free memory, - * when reading a new block into the ARC, we will evict an equal-sized block - * from the ARC. - * - * This must be less than arc_shrink_shift, so that when we shrink the ARC, - * we will still not allow it to grow. - */ -int arc_no_grow_shift = 5; - - -/* - * minimum lifespan of a prefetch block in clock ticks - * (initialized in arc_init()) - */ -static int zfs_arc_min_prefetch_ms = 1; -static int zfs_arc_min_prescient_prefetch_ms = 6; - -/* - * If this percent of memory is free, don't throttle. - */ -int arc_lotsfree_percent = 10; - -static boolean_t arc_initialized; -extern boolean_t zfs_prefetch_disable; - -/* - * The arc has filled available memory and has now warmed up. - */ -static boolean_t arc_warm; - -/* - * log2 fraction of the zio arena to keep free. - */ -int arc_zio_arena_free_shift = 2; - -/* - * These tunables are for performance analysis. - */ -uint64_t zfs_arc_max; -uint64_t zfs_arc_min; -uint64_t zfs_arc_meta_limit = 0; -uint64_t zfs_arc_meta_min = 0; -uint64_t zfs_arc_dnode_limit = 0; -uint64_t zfs_arc_dnode_reduce_percent = 10; -int zfs_arc_grow_retry = 0; -int zfs_arc_shrink_shift = 0; -int zfs_arc_no_grow_shift = 0; -int zfs_arc_p_min_shift = 0; -uint64_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */ -u_int zfs_arc_free_target = 0; - -/* Absolute min for arc min / max is 16MB. 
*/ -static uint64_t arc_abs_min = 16 << 20; - -/* - * ARC dirty data constraints for arc_tempreserve_space() throttle - */ -uint_t zfs_arc_dirty_limit_percent = 50; /* total dirty data limit */ -uint_t zfs_arc_anon_limit_percent = 25; /* anon block dirty limit */ -uint_t zfs_arc_pool_dirty_percent = 20; /* each pool's anon allowance */ - -boolean_t zfs_compressed_arc_enabled = B_TRUE; - -static int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS); -static int sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS); -static int sysctl_vfs_zfs_arc_max(SYSCTL_HANDLER_ARGS); -static int sysctl_vfs_zfs_arc_min(SYSCTL_HANDLER_ARGS); -static int sysctl_vfs_zfs_arc_no_grow_shift(SYSCTL_HANDLER_ARGS); - -#if defined(__FreeBSD__) && defined(_KERNEL) -static void -arc_free_target_init(void *unused __unused) -{ - - zfs_arc_free_target = vm_cnt.v_free_target; -} -SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY, - arc_free_target_init, NULL); - -TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit); -TUNABLE_QUAD("vfs.zfs.arc_meta_min", &zfs_arc_meta_min); -TUNABLE_INT("vfs.zfs.arc_shrink_shift", &zfs_arc_shrink_shift); -TUNABLE_INT("vfs.zfs.arc_grow_retry", &zfs_arc_grow_retry); -TUNABLE_INT("vfs.zfs.arc_no_grow_shift", &zfs_arc_no_grow_shift); -SYSCTL_DECL(_vfs_zfs); -SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_max, - CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, - 0, sizeof(uint64_t), sysctl_vfs_zfs_arc_max, "QU", "Maximum ARC size"); -SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_min, - CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, - 0, sizeof(uint64_t), sysctl_vfs_zfs_arc_min, "QU", "Minimum ARC size"); -SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_no_grow_shift, - CTLTYPE_U32 | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, - 0, sizeof(uint32_t), sysctl_vfs_zfs_arc_no_grow_shift, "U", - "log2(fraction of ARC which must be free to allow growing)"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_average_blocksize, CTLFLAG_RDTUN, - &zfs_arc_average_blocksize, 0, - "ARC average blocksize"); -SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_shrink_shift, CTLFLAG_RW, - &arc_shrink_shift, 0, - "log2(fraction of arc to reclaim)"); -SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_grow_retry, CTLFLAG_RW, - &arc_grow_retry, 0, - "Wait in seconds before considering growing ARC"); -SYSCTL_INT(_vfs_zfs, OID_AUTO, compressed_arc_enabled, CTLFLAG_RDTUN, - &zfs_compressed_arc_enabled, 0, - "Enable compressed ARC"); -SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_kmem_cache_reap_retry_ms, CTLFLAG_RWTUN, - &arc_kmem_cache_reap_retry_ms, 0, - "Interval between ARC kmem_cache reapings"); - -/* - * We don't have a tunable for arc_free_target due to the dependency on - * pagedaemon initialisation. - */ -SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target, - CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(u_int), - sysctl_vfs_zfs_arc_free_target, "IU", - "Desired number of free pages below which ARC triggers reclaim"); - -static int -sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS) -{ - u_int val; - int err; - - val = zfs_arc_free_target; - err = sysctl_handle_int(oidp, &val, 0, req); - if (err != 0 || req->newptr == NULL) - return (err); - - if (val < minfree) - return (EINVAL); - if (val > vm_cnt.v_page_count) - return (EINVAL); - - zfs_arc_free_target = val; - - return (0); -} - -/* - * Must be declared here, before the definition of corresponding kstat - * macro which uses the same names will confuse the compiler. 
- */
-SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_meta_limit,
-    CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),
-    sysctl_vfs_zfs_arc_meta_limit, "QU",
-    "ARC metadata limit");
-#endif
-
-/*
- * Note that buffers can be in one of 6 states:
- *	ARC_anon	- anonymous (discussed below)
- *	ARC_mru		- recently used, currently cached
- *	ARC_mru_ghost	- recently used, no longer in cache
- *	ARC_mfu		- frequently used, currently cached
- *	ARC_mfu_ghost	- frequently used, no longer in cache
- *	ARC_l2c_only	- exists in L2ARC but not other states
- * When there are no active references to the buffer, they are
- * linked onto a list in one of these arc states. These are
- * the only buffers that can be evicted or deleted. Within each
- * state there are multiple lists, one for meta-data and one for
- * non-meta-data. Meta-data (indirect blocks, blocks of dnodes,
- * etc.) is tracked separately so that it can be managed more
- * explicitly: favored over data, limited explicitly.
- *
- * Anonymous buffers are buffers that are not associated with
- * a DVA. These are buffers that hold dirty block copies
- * before they are written to stable storage. By definition,
- * they are "ref'd" and are considered part of arc_mru
- * that cannot be freed. Generally, they will acquire a DVA
- * as they are written and migrate onto the arc_mru list.
- *
- * The ARC_l2c_only state is for buffers that are in the second
- * level ARC but no longer in any of the ARC_m* lists. The second
- * level ARC itself may also contain buffers that are in any of
- * the ARC_m* states - meaning that a buffer can exist in two
- * places. The reason for the ARC_l2c_only state is to keep the
- * buffer header in the hash table, so that reads that hit the
- * second level ARC benefit from these fast lookups.
- */
-
-typedef struct arc_state {
-	/*
-	 * list of evictable buffers
-	 */
-	multilist_t *arcs_list[ARC_BUFC_NUMTYPES];
-	/*
-	 * total amount of evictable data in this state
-	 */
-	zfs_refcount_t arcs_esize[ARC_BUFC_NUMTYPES];
-	/*
-	 * total amount of data in this state; this includes: evictable,
-	 * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA.
-	 */
-	zfs_refcount_t arcs_size;
-	/*
-	 * supports the "dbufs" kstat
-	 */
-	arc_state_type_t arcs_state;
-} arc_state_t;
-
-/*
- * Percentage that can be consumed by dnodes of ARC meta buffers.
- */ -int zfs_arc_meta_prune = 10000; -unsigned long zfs_arc_dnode_limit_percent = 10; -int zfs_arc_meta_strategy = ARC_STRATEGY_META_ONLY; -int zfs_arc_meta_adjust_restarts = 4096; - -SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_meta_strategy, CTLFLAG_RWTUN, - &zfs_arc_meta_strategy, 0, - "ARC metadata reclamation strategy " - "(0 = metadata only, 1 = balance data and metadata)"); - -/* The 6 states: */ -static arc_state_t ARC_anon; -static arc_state_t ARC_mru; -static arc_state_t ARC_mru_ghost; -static arc_state_t ARC_mfu; -static arc_state_t ARC_mfu_ghost; -static arc_state_t ARC_l2c_only; - -typedef struct arc_stats { - kstat_named_t arcstat_hits; - kstat_named_t arcstat_misses; - kstat_named_t arcstat_demand_data_hits; - kstat_named_t arcstat_demand_data_misses; - kstat_named_t arcstat_demand_metadata_hits; - kstat_named_t arcstat_demand_metadata_misses; - kstat_named_t arcstat_prefetch_data_hits; - kstat_named_t arcstat_prefetch_data_misses; - kstat_named_t arcstat_prefetch_metadata_hits; - kstat_named_t arcstat_prefetch_metadata_misses; - kstat_named_t arcstat_mru_hits; - kstat_named_t arcstat_mru_ghost_hits; - kstat_named_t arcstat_mfu_hits; - kstat_named_t arcstat_mfu_ghost_hits; - kstat_named_t arcstat_allocated; - kstat_named_t arcstat_deleted; - /* - * Number of buffers that could not be evicted because the hash lock - * was held by another thread. The lock may not necessarily be held - * by something using the same buffer, since hash locks are shared - * by multiple buffers. - */ - kstat_named_t arcstat_mutex_miss; - /* - * Number of buffers skipped when updating the access state due to the - * header having already been released after acquiring the hash lock. - */ - kstat_named_t arcstat_access_skip; - /* - * Number of buffers skipped because they have I/O in progress, are - * indirect prefetch buffers that have not lived long enough, or are - * not from the spa we're trying to evict from. - */ - kstat_named_t arcstat_evict_skip; - /* - * Number of times arc_evict_state() was unable to evict enough - * buffers to reach it's target amount. - */ - kstat_named_t arcstat_evict_not_enough; - kstat_named_t arcstat_evict_l2_cached; - kstat_named_t arcstat_evict_l2_eligible; - kstat_named_t arcstat_evict_l2_ineligible; - kstat_named_t arcstat_evict_l2_skip; - kstat_named_t arcstat_hash_elements; - kstat_named_t arcstat_hash_elements_max; - kstat_named_t arcstat_hash_collisions; - kstat_named_t arcstat_hash_chains; - kstat_named_t arcstat_hash_chain_max; - kstat_named_t arcstat_p; - kstat_named_t arcstat_c; - kstat_named_t arcstat_c_min; - kstat_named_t arcstat_c_max; - /* Not updated directly; only synced in arc_kstat_update. */ - kstat_named_t arcstat_size; - /* - * Number of compressed bytes stored in the arc_buf_hdr_t's b_pabd. - * Note that the compressed bytes may match the uncompressed bytes - * if the block is either not compressed or compressed arc is disabled. - */ - kstat_named_t arcstat_compressed_size; - /* - * Uncompressed size of the data stored in b_pabd. If compressed - * arc is disabled then this value will be identical to the stat - * above. - */ - kstat_named_t arcstat_uncompressed_size; - /* - * Number of bytes stored in all the arc_buf_t's. This is classified - * as "overhead" since this data is typically short-lived and will - * be evicted from the arc when it becomes unreferenced unless the - * zfs_keep_uncompressed_metadata or zfs_keep_uncompressed_level - * values have been set (see comment in dbuf.c for more information). 
- */ - kstat_named_t arcstat_overhead_size; - /* - * Number of bytes consumed by internal ARC structures necessary - * for tracking purposes; these structures are not actually - * backed by ARC buffers. This includes arc_buf_hdr_t structures - * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only - * caches), and arc_buf_t structures (allocated via arc_buf_t - * cache). - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_hdr_size; - /* - * Number of bytes consumed by ARC buffers of type equal to - * ARC_BUFC_DATA. This is generally consumed by buffers backing - * on disk user data (e.g. plain file contents). - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_data_size; - /* - * Number of bytes consumed by ARC buffers of type equal to - * ARC_BUFC_METADATA. This is generally consumed by buffers - * backing on disk data that is used for internal ZFS - * structures (e.g. ZAP, dnode, indirect blocks, etc). - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_metadata_size; - /* - * Number of bytes consumed by dmu_buf_impl_t objects. - */ - kstat_named_t arcstat_dbuf_size; - /* - * Number of bytes consumed by dnode_t objects. - */ - kstat_named_t arcstat_dnode_size; - /* - * Number of bytes consumed by bonus buffers. - */ - kstat_named_t arcstat_bonus_size; -#if defined(__FreeBSD__) && defined(COMPAT_FREEBSD11) - /* - * Sum of the previous three counters, provided for compatibility. - */ - kstat_named_t arcstat_other_size; -#endif - /* - * Total number of bytes consumed by ARC buffers residing in the - * arc_anon state. This includes *all* buffers in the arc_anon - * state; e.g. data, metadata, evictable, and unevictable buffers - * are all included in this value. - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_anon_size; - /* - * Number of bytes consumed by ARC buffers that meet the - * following criteria: backing buffers of type ARC_BUFC_DATA, - * residing in the arc_anon state, and are eligible for eviction - * (e.g. have no outstanding holds on the buffer). - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_anon_evictable_data; - /* - * Number of bytes consumed by ARC buffers that meet the - * following criteria: backing buffers of type ARC_BUFC_METADATA, - * residing in the arc_anon state, and are eligible for eviction - * (e.g. have no outstanding holds on the buffer). - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_anon_evictable_metadata; - /* - * Total number of bytes consumed by ARC buffers residing in the - * arc_mru state. This includes *all* buffers in the arc_mru - * state; e.g. data, metadata, evictable, and unevictable buffers - * are all included in this value. - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_mru_size; - /* - * Number of bytes consumed by ARC buffers that meet the - * following criteria: backing buffers of type ARC_BUFC_DATA, - * residing in the arc_mru state, and are eligible for eviction - * (e.g. have no outstanding holds on the buffer). - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_mru_evictable_data; - /* - * Number of bytes consumed by ARC buffers that meet the - * following criteria: backing buffers of type ARC_BUFC_METADATA, - * residing in the arc_mru state, and are eligible for eviction - * (e.g. have no outstanding holds on the buffer). 
-	 * Not updated directly; only synced in arc_kstat_update.
-	 */
-	kstat_named_t arcstat_mru_evictable_metadata;
-	/*
-	 * Total number of bytes that *would have been* consumed by ARC
-	 * buffers in the arc_mru_ghost state. The key thing to note
-	 * here is that this size doesn't actually indicate
-	 * RAM consumption. The ghost lists only consist of headers and
-	 * don't actually have ARC buffers linked off of these headers.
-	 * Thus, *if* the headers had associated ARC buffers, these
-	 * buffers *would have* consumed this number of bytes.
-	 * Not updated directly; only synced in arc_kstat_update.
-	 */
-	kstat_named_t arcstat_mru_ghost_size;
-	/*
-	 * Number of bytes that *would have been* consumed by ARC
-	 * buffers that are eligible for eviction, of type
-	 * ARC_BUFC_DATA, and linked off the arc_mru_ghost state.
-	 * Not updated directly; only synced in arc_kstat_update.
-	 */
-	kstat_named_t arcstat_mru_ghost_evictable_data;
-	/*
-	 * Number of bytes that *would have been* consumed by ARC
-	 * buffers that are eligible for eviction, of type
-	 * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state.
-	 * Not updated directly; only synced in arc_kstat_update.
-	 */
-	kstat_named_t arcstat_mru_ghost_evictable_metadata;
-	/*
-	 * Total number of bytes consumed by ARC buffers residing in the
-	 * arc_mfu state. This includes *all* buffers in the arc_mfu
-	 * state; e.g. data, metadata, evictable, and unevictable buffers
-	 * are all included in this value.
-	 * Not updated directly; only synced in arc_kstat_update.
-	 */
-	kstat_named_t arcstat_mfu_size;
-	/*
-	 * Number of bytes consumed by ARC buffers that are eligible for
-	 * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu
-	 * state.
-	 * Not updated directly; only synced in arc_kstat_update.
-	 */
-	kstat_named_t arcstat_mfu_evictable_data;
-	/*
-	 * Number of bytes consumed by ARC buffers that are eligible for
-	 * eviction, of type ARC_BUFC_METADATA, and reside in the
-	 * arc_mfu state.
-	 * Not updated directly; only synced in arc_kstat_update.
-	 */
-	kstat_named_t arcstat_mfu_evictable_metadata;
-	/*
-	 * Total number of bytes that *would have been* consumed by ARC
-	 * buffers in the arc_mfu_ghost state. See the comment above
-	 * arcstat_mru_ghost_size for more details.
-	 * Not updated directly; only synced in arc_kstat_update.
-	 */
-	kstat_named_t arcstat_mfu_ghost_size;
-	/*
-	 * Number of bytes that *would have been* consumed by ARC
-	 * buffers that are eligible for eviction, of type
-	 * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state.
-	 * Not updated directly; only synced in arc_kstat_update.
-	 */
-	kstat_named_t arcstat_mfu_ghost_evictable_data;
-	/*
-	 * Number of bytes that *would have been* consumed by ARC
-	 * buffers that are eligible for eviction, of type
-	 * ARC_BUFC_METADATA, and linked off the arc_mfu_ghost state.
-	 * Not updated directly; only synced in arc_kstat_update.
- */ - kstat_named_t arcstat_mfu_ghost_evictable_metadata; - kstat_named_t arcstat_l2_hits; - kstat_named_t arcstat_l2_misses; - kstat_named_t arcstat_l2_feeds; - kstat_named_t arcstat_l2_rw_clash; - kstat_named_t arcstat_l2_read_bytes; - kstat_named_t arcstat_l2_write_bytes; - kstat_named_t arcstat_l2_writes_sent; - kstat_named_t arcstat_l2_writes_done; - kstat_named_t arcstat_l2_writes_error; - kstat_named_t arcstat_l2_writes_lock_retry; - kstat_named_t arcstat_l2_evict_lock_retry; - kstat_named_t arcstat_l2_evict_reading; - kstat_named_t arcstat_l2_evict_l1cached; - kstat_named_t arcstat_l2_free_on_write; - kstat_named_t arcstat_l2_abort_lowmem; - kstat_named_t arcstat_l2_cksum_bad; - kstat_named_t arcstat_l2_io_error; - kstat_named_t arcstat_l2_lsize; - kstat_named_t arcstat_l2_psize; - /* Not updated directly; only synced in arc_kstat_update. */ - kstat_named_t arcstat_l2_hdr_size; - kstat_named_t arcstat_l2_write_trylock_fail; - kstat_named_t arcstat_l2_write_passed_headroom; - kstat_named_t arcstat_l2_write_spa_mismatch; - kstat_named_t arcstat_l2_write_in_l2; - kstat_named_t arcstat_l2_write_hdr_io_in_progress; - kstat_named_t arcstat_l2_write_not_cacheable; - kstat_named_t arcstat_l2_write_full; - kstat_named_t arcstat_l2_write_buffer_iter; - kstat_named_t arcstat_l2_write_pios; - kstat_named_t arcstat_l2_write_buffer_bytes_scanned; - kstat_named_t arcstat_l2_write_buffer_list_iter; - kstat_named_t arcstat_l2_write_buffer_list_null_iter; - kstat_named_t arcstat_memory_throttle_count; - kstat_named_t arcstat_memory_direct_count; - kstat_named_t arcstat_memory_indirect_count; - kstat_named_t arcstat_memory_all_bytes; - kstat_named_t arcstat_memory_free_bytes; - kstat_named_t arcstat_memory_available_bytes; - kstat_named_t arcstat_no_grow; - kstat_named_t arcstat_tempreserve; - kstat_named_t arcstat_loaned_bytes; - kstat_named_t arcstat_prune; - /* Not updated directly; only synced in arc_kstat_update. 
*/ - kstat_named_t arcstat_meta_used; - kstat_named_t arcstat_meta_limit; - kstat_named_t arcstat_dnode_limit; - kstat_named_t arcstat_meta_max; - kstat_named_t arcstat_meta_min; - kstat_named_t arcstat_async_upgrade_sync; - kstat_named_t arcstat_demand_hit_predictive_prefetch; - kstat_named_t arcstat_demand_hit_prescient_prefetch; -} arc_stats_t; - -static arc_stats_t arc_stats = { - { "hits", KSTAT_DATA_UINT64 }, - { "misses", KSTAT_DATA_UINT64 }, - { "demand_data_hits", KSTAT_DATA_UINT64 }, - { "demand_data_misses", KSTAT_DATA_UINT64 }, - { "demand_metadata_hits", KSTAT_DATA_UINT64 }, - { "demand_metadata_misses", KSTAT_DATA_UINT64 }, - { "prefetch_data_hits", KSTAT_DATA_UINT64 }, - { "prefetch_data_misses", KSTAT_DATA_UINT64 }, - { "prefetch_metadata_hits", KSTAT_DATA_UINT64 }, - { "prefetch_metadata_misses", KSTAT_DATA_UINT64 }, - { "mru_hits", KSTAT_DATA_UINT64 }, - { "mru_ghost_hits", KSTAT_DATA_UINT64 }, - { "mfu_hits", KSTAT_DATA_UINT64 }, - { "mfu_ghost_hits", KSTAT_DATA_UINT64 }, - { "allocated", KSTAT_DATA_UINT64 }, - { "deleted", KSTAT_DATA_UINT64 }, - { "mutex_miss", KSTAT_DATA_UINT64 }, - { "access_skip", KSTAT_DATA_UINT64 }, - { "evict_skip", KSTAT_DATA_UINT64 }, - { "evict_not_enough", KSTAT_DATA_UINT64 }, - { "evict_l2_cached", KSTAT_DATA_UINT64 }, - { "evict_l2_eligible", KSTAT_DATA_UINT64 }, - { "evict_l2_ineligible", KSTAT_DATA_UINT64 }, - { "evict_l2_skip", KSTAT_DATA_UINT64 }, - { "hash_elements", KSTAT_DATA_UINT64 }, - { "hash_elements_max", KSTAT_DATA_UINT64 }, - { "hash_collisions", KSTAT_DATA_UINT64 }, - { "hash_chains", KSTAT_DATA_UINT64 }, - { "hash_chain_max", KSTAT_DATA_UINT64 }, - { "p", KSTAT_DATA_UINT64 }, - { "c", KSTAT_DATA_UINT64 }, - { "c_min", KSTAT_DATA_UINT64 }, - { "c_max", KSTAT_DATA_UINT64 }, - { "size", KSTAT_DATA_UINT64 }, - { "compressed_size", KSTAT_DATA_UINT64 }, - { "uncompressed_size", KSTAT_DATA_UINT64 }, - { "overhead_size", KSTAT_DATA_UINT64 }, - { "hdr_size", KSTAT_DATA_UINT64 }, - { "data_size", KSTAT_DATA_UINT64 }, - { "metadata_size", KSTAT_DATA_UINT64 }, - { "dbuf_size", KSTAT_DATA_UINT64 }, - { "dnode_size", KSTAT_DATA_UINT64 }, - { "bonus_size", KSTAT_DATA_UINT64 }, -#if defined(__FreeBSD__) && defined(COMPAT_FREEBSD11) - { "other_size", KSTAT_DATA_UINT64 }, -#endif - { "anon_size", KSTAT_DATA_UINT64 }, - { "anon_evictable_data", KSTAT_DATA_UINT64 }, - { "anon_evictable_metadata", KSTAT_DATA_UINT64 }, - { "mru_size", KSTAT_DATA_UINT64 }, - { "mru_evictable_data", KSTAT_DATA_UINT64 }, - { "mru_evictable_metadata", KSTAT_DATA_UINT64 }, - { "mru_ghost_size", KSTAT_DATA_UINT64 }, - { "mru_ghost_evictable_data", KSTAT_DATA_UINT64 }, - { "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, - { "mfu_size", KSTAT_DATA_UINT64 }, - { "mfu_evictable_data", KSTAT_DATA_UINT64 }, - { "mfu_evictable_metadata", KSTAT_DATA_UINT64 }, - { "mfu_ghost_size", KSTAT_DATA_UINT64 }, - { "mfu_ghost_evictable_data", KSTAT_DATA_UINT64 }, - { "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, - { "l2_hits", KSTAT_DATA_UINT64 }, - { "l2_misses", KSTAT_DATA_UINT64 }, - { "l2_feeds", KSTAT_DATA_UINT64 }, - { "l2_rw_clash", KSTAT_DATA_UINT64 }, - { "l2_read_bytes", KSTAT_DATA_UINT64 }, - { "l2_write_bytes", KSTAT_DATA_UINT64 }, - { "l2_writes_sent", KSTAT_DATA_UINT64 }, - { "l2_writes_done", KSTAT_DATA_UINT64 }, - { "l2_writes_error", KSTAT_DATA_UINT64 }, - { "l2_writes_lock_retry", KSTAT_DATA_UINT64 }, - { "l2_evict_lock_retry", KSTAT_DATA_UINT64 }, - { "l2_evict_reading", KSTAT_DATA_UINT64 }, - { "l2_evict_l1cached", KSTAT_DATA_UINT64 }, - { 
"l2_free_on_write", KSTAT_DATA_UINT64 }, - { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, - { "l2_cksum_bad", KSTAT_DATA_UINT64 }, - { "l2_io_error", KSTAT_DATA_UINT64 }, - { "l2_size", KSTAT_DATA_UINT64 }, - { "l2_asize", KSTAT_DATA_UINT64 }, - { "l2_hdr_size", KSTAT_DATA_UINT64 }, - { "l2_write_trylock_fail", KSTAT_DATA_UINT64 }, - { "l2_write_passed_headroom", KSTAT_DATA_UINT64 }, - { "l2_write_spa_mismatch", KSTAT_DATA_UINT64 }, - { "l2_write_in_l2", KSTAT_DATA_UINT64 }, - { "l2_write_io_in_progress", KSTAT_DATA_UINT64 }, - { "l2_write_not_cacheable", KSTAT_DATA_UINT64 }, - { "l2_write_full", KSTAT_DATA_UINT64 }, - { "l2_write_buffer_iter", KSTAT_DATA_UINT64 }, - { "l2_write_pios", KSTAT_DATA_UINT64 }, - { "l2_write_buffer_bytes_scanned", KSTAT_DATA_UINT64 }, - { "l2_write_buffer_list_iter", KSTAT_DATA_UINT64 }, - { "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 }, - { "memory_throttle_count", KSTAT_DATA_UINT64 }, - { "memory_direct_count", KSTAT_DATA_UINT64 }, - { "memory_indirect_count", KSTAT_DATA_UINT64 }, - { "memory_all_bytes", KSTAT_DATA_UINT64 }, - { "memory_free_bytes", KSTAT_DATA_UINT64 }, - { "memory_available_bytes", KSTAT_DATA_UINT64 }, - { "arc_no_grow", KSTAT_DATA_UINT64 }, - { "arc_tempreserve", KSTAT_DATA_UINT64 }, - { "arc_loaned_bytes", KSTAT_DATA_UINT64 }, - { "arc_prune", KSTAT_DATA_UINT64 }, - { "arc_meta_used", KSTAT_DATA_UINT64 }, - { "arc_meta_limit", KSTAT_DATA_UINT64 }, - { "arc_dnode_limit", KSTAT_DATA_UINT64 }, - { "arc_meta_max", KSTAT_DATA_UINT64 }, - { "arc_meta_min", KSTAT_DATA_UINT64 }, - { "async_upgrade_sync", KSTAT_DATA_UINT64 }, - { "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 }, - { "demand_hit_prescient_prefetch", KSTAT_DATA_UINT64 }, -}; - -#define ARCSTAT(stat) (arc_stats.stat.value.ui64) - -#define ARCSTAT_INCR(stat, val) \ - atomic_add_64(&arc_stats.stat.value.ui64, (val)) - -#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) -#define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) - -#define ARCSTAT_MAX(stat, val) { \ - uint64_t m; \ - while ((val) > (m = arc_stats.stat.value.ui64) && \ - (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \ - continue; \ -} - -#define ARCSTAT_MAXSTAT(stat) \ - ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64) - -/* - * We define a macro to allow ARC hits/misses to be easily broken down by - * two separate conditions, giving a total of four different subtypes for - * each of hits and misses (so eight statistics total). - */ -#define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \ - if (cond1) { \ - if (cond2) { \ - ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \ - } else { \ - ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \ - } \ - } else { \ - if (cond2) { \ - ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \ - } else { \ - ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\ - } \ - } - -kstat_t *arc_ksp; -static arc_state_t *arc_anon; -static arc_state_t *arc_mru; -static arc_state_t *arc_mru_ghost; -static arc_state_t *arc_mfu; -static arc_state_t *arc_mfu_ghost; -static arc_state_t *arc_l2c_only; - -/* - * There are several ARC variables that are critical to export as kstats -- - * but we don't want to have to grovel around in the kstat whenever we wish to - * manipulate them. For these variables, we therefore define them to be in - * terms of the statistic variable. This assures that we are not introducing - * the possibility of inconsistency by having shadow copies of the variables, - * while still allowing the code to be readable. 
- */ -#define arc_p ARCSTAT(arcstat_p) /* target size of MRU */ -#define arc_c ARCSTAT(arcstat_c) /* target size of cache */ -#define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ -#define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ -#define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */ -#define arc_dnode_limit ARCSTAT(arcstat_dnode_limit) /* max size for dnodes */ -#define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */ -#define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */ -#define arc_dbuf_size ARCSTAT(arcstat_dbuf_size) /* dbuf metadata */ -#define arc_dnode_size ARCSTAT(arcstat_dnode_size) /* dnode metadata */ -#define arc_bonus_size ARCSTAT(arcstat_bonus_size) /* bonus buffer metadata */ - -/* compressed size of entire arc */ -#define arc_compressed_size ARCSTAT(arcstat_compressed_size) -/* uncompressed size of entire arc */ -#define arc_uncompressed_size ARCSTAT(arcstat_uncompressed_size) -/* number of bytes in the arc from arc_buf_t's */ -#define arc_overhead_size ARCSTAT(arcstat_overhead_size) - -/* - * There are also some ARC variables that we want to export, but that are - * updated so often that having the canonical representation be the statistic - * variable causes a performance bottleneck. We want to use aggsum_t's for these - * instead, but still be able to export the kstat in the same way as before. - * The solution is to always use the aggsum version, except in the kstat update - * callback. - */ -aggsum_t arc_size; -aggsum_t arc_meta_used; -aggsum_t astat_data_size; -aggsum_t astat_metadata_size; -aggsum_t astat_hdr_size; -aggsum_t astat_bonus_size; -aggsum_t astat_dnode_size; -aggsum_t astat_dbuf_size; -aggsum_t astat_l2_hdr_size; - -static list_t arc_prune_list; -static kmutex_t arc_prune_mtx; -static taskq_t *arc_prune_taskq; - -static int arc_no_grow; /* Don't try to grow cache size */ -static hrtime_t arc_growtime; -static uint64_t arc_tempreserve; -static uint64_t arc_loaned_bytes; - -typedef struct arc_callback arc_callback_t; - -struct arc_callback { - void *acb_private; - arc_read_done_func_t *acb_done; - arc_buf_t *acb_buf; - boolean_t acb_compressed; - zio_t *acb_zio_dummy; - zio_t *acb_zio_head; - arc_callback_t *acb_next; -}; - -typedef struct arc_write_callback arc_write_callback_t; - -struct arc_write_callback { - void *awcb_private; - arc_write_done_func_t *awcb_ready; - arc_write_done_func_t *awcb_children_ready; - arc_write_done_func_t *awcb_physdone; - arc_write_done_func_t *awcb_done; - arc_buf_t *awcb_buf; -}; - -/* - * ARC buffers are separated into multiple structs as a memory saving measure: - * - Common fields struct, always defined, and embedded within it: - * - L2-only fields, always allocated but undefined when not in L2ARC - * - L1-only fields, only allocated when in L1ARC - * - * Buffer in L1 Buffer only in L2 - * +------------------------+ +------------------------+ - * | arc_buf_hdr_t | | arc_buf_hdr_t | - * | | | | - * | | | | - * | | | | - * +------------------------+ +------------------------+ - * | l2arc_buf_hdr_t | | l2arc_buf_hdr_t | - * | (undefined if L1-only) | | | - * +------------------------+ +------------------------+ - * | l1arc_buf_hdr_t | - * | | - * | | - * | | - * | | - * +------------------------+ - * - * Because it's possible for the L2ARC to become extremely large, we can wind - * up eating a lot of memory in L2ARC buffer headers, so the size of a header - * is minimized by only allocating the fields necessary for an 
L1-cached buffer - * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and - * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple - * words in pointers. arc_hdr_realloc() is used to switch a header between - * these two allocation states. - */ -typedef struct l1arc_buf_hdr { - kmutex_t b_freeze_lock; - zio_cksum_t *b_freeze_cksum; -#ifdef ZFS_DEBUG - /* - * Used for debugging with kmem_flags - by allocating and freeing - * b_thawed when the buffer is thawed, we get a record of the stack - * trace that thawed it. - */ - void *b_thawed; -#endif - - arc_buf_t *b_buf; - uint32_t b_bufcnt; - /* for waiting on writes to complete */ - kcondvar_t b_cv; - uint8_t b_byteswap; - - /* protected by arc state mutex */ - arc_state_t *b_state; - multilist_node_t b_arc_node; - - /* updated atomically */ - clock_t b_arc_access; - uint32_t b_mru_hits; - uint32_t b_mru_ghost_hits; - uint32_t b_mfu_hits; - uint32_t b_mfu_ghost_hits; - uint32_t b_l2_hits; - - /* self protecting */ - zfs_refcount_t b_refcnt; - - arc_callback_t *b_acb; - abd_t *b_pabd; -} l1arc_buf_hdr_t; - -typedef struct l2arc_dev l2arc_dev_t; - -typedef struct l2arc_buf_hdr { - /* protected by arc_buf_hdr mutex */ - l2arc_dev_t *b_dev; /* L2ARC device */ - uint64_t b_daddr; /* disk address, offset byte */ - uint32_t b_hits; - - list_node_t b_l2node; -} l2arc_buf_hdr_t; - -struct arc_buf_hdr { - /* protected by hash lock */ - dva_t b_dva; - uint64_t b_birth; - - arc_buf_contents_t b_type; - arc_buf_hdr_t *b_hash_next; - arc_flags_t b_flags; - - /* - * This field stores the size of the data buffer after - * compression, and is set in the arc's zio completion handlers. - * It is in units of SPA_MINBLOCKSIZE (e.g. 1 == 512 bytes). - * - * While the block pointers can store up to 32MB in their psize - * field, we can only store up to 32MB minus 512B. This is due - * to the bp using a bias of 1, whereas we use a bias of 0 (i.e. - * a field of zeros represents 512B in the bp). We can't use a - * bias of 1 since we need to reserve a psize of zero, here, to - * represent holes and embedded blocks. - * - * This isn't a problem in practice, since the maximum size of a - * buffer is limited to 16MB, so we never need to store 32MB in - * this field. Even in the upstream illumos code base, the - * maximum size of a buffer is limited to 16MB. - */ - uint16_t b_psize; - - /* - * This field stores the size of the data buffer before - * compression, and cannot change once set. It is in units - * of SPA_MINBLOCKSIZE (e.g. 2 == 1024 bytes) - */ - uint16_t b_lsize; /* immutable */ - uint64_t b_spa; /* immutable */ - - /* L2ARC fields. Undefined when not in L2ARC. */ - l2arc_buf_hdr_t b_l2hdr; - /* L1ARC fields. 
Undefined when in l2arc_only state */ - l1arc_buf_hdr_t b_l1hdr; -}; - -#if defined(__FreeBSD__) && defined(_KERNEL) -static int -sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS) -{ - uint64_t val; - int err; - - val = arc_meta_limit; - err = sysctl_handle_64(oidp, &val, 0, req); - if (err != 0 || req->newptr == NULL) - return (err); - - if (val <= 0 || val > arc_c_max) - return (EINVAL); - - arc_meta_limit = val; - - mutex_enter(&arc_adjust_lock); - arc_adjust_needed = B_TRUE; - mutex_exit(&arc_adjust_lock); - zthr_wakeup(arc_adjust_zthr); - - return (0); -} - -static int -sysctl_vfs_zfs_arc_no_grow_shift(SYSCTL_HANDLER_ARGS) -{ - uint32_t val; - int err; - - val = arc_no_grow_shift; - err = sysctl_handle_32(oidp, &val, 0, req); - if (err != 0 || req->newptr == NULL) - return (err); - - if (val >= arc_shrink_shift) - return (EINVAL); - - arc_no_grow_shift = val; - return (0); -} - -static int -sysctl_vfs_zfs_arc_max(SYSCTL_HANDLER_ARGS) -{ - uint64_t val; - int err; - - val = zfs_arc_max; - err = sysctl_handle_64(oidp, &val, 0, req); - if (err != 0 || req->newptr == NULL) - return (err); - - if (zfs_arc_max == 0) { - /* Loader tunable so blindly set */ - zfs_arc_max = val; - return (0); - } - - if (val < arc_abs_min || val > kmem_size()) - return (EINVAL); - if (val < arc_c_min) - return (EINVAL); - if (zfs_arc_meta_limit > 0 && val < zfs_arc_meta_limit) - return (EINVAL); - - arc_c_max = val; - - arc_c = arc_c_max; - arc_p = (arc_c >> 1); - - if (zfs_arc_meta_limit == 0) { - /* limit meta-data to 1/4 of the arc capacity */ - arc_meta_limit = arc_c_max / 4; - } - - /* if kmem_flags are set, lets try to use less memory */ - if (kmem_debugging()) - arc_c = arc_c / 2; - - zfs_arc_max = arc_c; - - mutex_enter(&arc_adjust_lock); - arc_adjust_needed = B_TRUE; - mutex_exit(&arc_adjust_lock); - zthr_wakeup(arc_adjust_zthr); - - return (0); -} - -static int -sysctl_vfs_zfs_arc_min(SYSCTL_HANDLER_ARGS) -{ - uint64_t val; - int err; - - val = zfs_arc_min; - err = sysctl_handle_64(oidp, &val, 0, req); - if (err != 0 || req->newptr == NULL) - return (err); - - if (zfs_arc_min == 0) { - /* Loader tunable so blindly set */ - zfs_arc_min = val; - return (0); - } - - if (val < arc_abs_min || val > arc_c_max) - return (EINVAL); - - arc_c_min = val; - - if (zfs_arc_meta_min == 0) - arc_meta_min = arc_c_min / 2; - - if (arc_c < arc_c_min) - arc_c = arc_c_min; - - zfs_arc_min = arc_c_min; - - return (0); -} -#endif - -#define GHOST_STATE(state) \ - ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ - (state) == arc_l2c_only) - -#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE) -#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) -#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_FLAG_IO_ERROR) -#define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH) -#define HDR_PRESCIENT_PREFETCH(hdr) \ - ((hdr)->b_flags & ARC_FLAG_PRESCIENT_PREFETCH) -#define HDR_COMPRESSION_ENABLED(hdr) \ - ((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC) - -#define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_FLAG_L2CACHE) -#define HDR_L2_READING(hdr) \ - (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \ - ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)) -#define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITING) -#define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_FLAG_L2_EVICTED) -#define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD) -#define HDR_SHARED_DATA(hdr) ((hdr)->b_flags & ARC_FLAG_SHARED_DATA) - -#define HDR_ISTYPE_METADATA(hdr) \ - ((hdr)->b_flags & 
ARC_FLAG_BUFC_METADATA) -#define HDR_ISTYPE_DATA(hdr) (!HDR_ISTYPE_METADATA(hdr)) - -#define HDR_HAS_L1HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR) -#define HDR_HAS_L2HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR) - -/* For storing compression mode in b_flags */ -#define HDR_COMPRESS_OFFSET (highbit64(ARC_FLAG_COMPRESS_0) - 1) - -#define HDR_GET_COMPRESS(hdr) ((enum zio_compress)BF32_GET((hdr)->b_flags, \ - HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS)) -#define HDR_SET_COMPRESS(hdr, cmp) BF32_SET((hdr)->b_flags, \ - HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS, (cmp)); - -#define ARC_BUF_LAST(buf) ((buf)->b_next == NULL) -#define ARC_BUF_SHARED(buf) ((buf)->b_flags & ARC_BUF_FLAG_SHARED) -#define ARC_BUF_COMPRESSED(buf) ((buf)->b_flags & ARC_BUF_FLAG_COMPRESSED) - -/* - * Other sizes - */ - -#define HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) -#define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr)) - -/* - * Hash table routines - */ - -#define HT_LOCK_PAD CACHE_LINE_SIZE - -struct ht_lock { - kmutex_t ht_lock; -#ifdef _KERNEL - unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))]; -#endif -}; - -#define BUF_LOCKS 256 -typedef struct buf_hash_table { - uint64_t ht_mask; - arc_buf_hdr_t **ht_table; - struct ht_lock ht_locks[BUF_LOCKS] __aligned(CACHE_LINE_SIZE); -} buf_hash_table_t; - -static buf_hash_table_t buf_hash_table; - -#define BUF_HASH_INDEX(spa, dva, birth) \ - (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) -#define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) -#define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock)) -#define HDR_LOCK(hdr) \ - (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth))) - -uint64_t zfs_crc64_table[256]; - -/* - * Level 2 ARC - */ - -#define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */ -#define L2ARC_HEADROOM 2 /* num of writes */ -/* - * If we discover during ARC scan any buffers to be compressed, we boost - * our headroom for the next scanning cycle by this percentage multiple. 
- */ -#define L2ARC_HEADROOM_BOOST 200 -#define L2ARC_FEED_SECS 1 /* caching interval secs */ -#define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */ - -#define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) -#define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) - -/* L2ARC Performance Tunables */ -uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */ -uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */ -uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */ -uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST; -uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ -uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */ -boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ -boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */ -boolean_t l2arc_norw = B_TRUE; /* no reads during writes */ - -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RWTUN, - &l2arc_write_max, 0, "max write size"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RWTUN, - &l2arc_write_boost, 0, "extra write during warmup"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RWTUN, - &l2arc_headroom, 0, "number of dev writes"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RWTUN, - &l2arc_feed_secs, 0, "interval seconds"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RWTUN, - &l2arc_feed_min_ms, 0, "min interval milliseconds"); - -SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RWTUN, - &l2arc_noprefetch, 0, "don't cache prefetch bufs"); -SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RWTUN, - &l2arc_feed_again, 0, "turbo warmup"); -SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RWTUN, - &l2arc_norw, 0, "no reads during writes"); - -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD, - &ARC_anon.arcs_size.rc_count, 0, "size of anonymous state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_esize, CTLFLAG_RD, - &ARC_anon.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, - "size of anonymous state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_esize, CTLFLAG_RD, - &ARC_anon.arcs_esize[ARC_BUFC_DATA].rc_count, 0, - "size of anonymous state"); - -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD, - &ARC_mru.arcs_size.rc_count, 0, "size of mru state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_esize, CTLFLAG_RD, - &ARC_mru.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, - "size of metadata in mru state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_esize, CTLFLAG_RD, - &ARC_mru.arcs_esize[ARC_BUFC_DATA].rc_count, 0, - "size of data in mru state"); - -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD, - &ARC_mru_ghost.arcs_size.rc_count, 0, "size of mru ghost state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_esize, CTLFLAG_RD, - &ARC_mru_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, - "size of metadata in mru ghost state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_esize, CTLFLAG_RD, - &ARC_mru_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0, - "size of data in mru ghost state"); - -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD, - &ARC_mfu.arcs_size.rc_count, 0, "size of mfu state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_esize, CTLFLAG_RD, - &ARC_mfu.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, - "size of metadata in mfu state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_esize, CTLFLAG_RD, - &ARC_mfu.arcs_esize[ARC_BUFC_DATA].rc_count, 0, - "size of data in mfu state"); - 
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD,
-    &ARC_mfu_ghost.arcs_size.rc_count, 0, "size of mfu ghost state");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_esize, CTLFLAG_RD,
-    &ARC_mfu_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
-    "size of metadata in mfu ghost state");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_esize, CTLFLAG_RD,
-    &ARC_mfu_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
-    "size of data in mfu ghost state");
-
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD,
-    &ARC_l2c_only.arcs_size.rc_count, 0, "size of l2c_only state");
-
-SYSCTL_UINT(_vfs_zfs, OID_AUTO, arc_min_prefetch_ms, CTLFLAG_RW,
-    &zfs_arc_min_prefetch_ms, 0, "Min life of prefetch block in ms");
-SYSCTL_UINT(_vfs_zfs, OID_AUTO, arc_min_prescient_prefetch_ms, CTLFLAG_RW,
-    &zfs_arc_min_prescient_prefetch_ms, 0, "Min life of prescient prefetched block in ms");
-
-/*
- * L2ARC Internals
- */
-struct l2arc_dev {
-	vdev_t		*l2ad_vdev;	/* vdev */
-	spa_t		*l2ad_spa;	/* spa */
-	uint64_t	l2ad_hand;	/* next write location */
-	uint64_t	l2ad_start;	/* first addr on device */
-	uint64_t	l2ad_end;	/* last addr on device */
-	boolean_t	l2ad_first;	/* first sweep through */
-	boolean_t	l2ad_writing;	/* currently writing */
-	kmutex_t	l2ad_mtx;	/* lock for buffer list */
-	list_t		l2ad_buflist;	/* buffer list */
-	list_node_t	l2ad_node;	/* device list node */
-	zfs_refcount_t	l2ad_alloc;	/* allocated bytes */
-};
-
-static list_t L2ARC_dev_list;			/* device list */
-static list_t *l2arc_dev_list;			/* device list pointer */
-static kmutex_t l2arc_dev_mtx;			/* device list mutex */
-static l2arc_dev_t *l2arc_dev_last;		/* last device used */
-static list_t L2ARC_free_on_write;		/* free after write buf list */
-static list_t *l2arc_free_on_write;		/* free after write list ptr */
-static kmutex_t l2arc_free_on_write_mtx;	/* mutex for list */
-static uint64_t l2arc_ndev;			/* number of devices */
-
-typedef struct l2arc_read_callback {
-	arc_buf_hdr_t		*l2rcb_hdr;	/* read header */
-	blkptr_t		l2rcb_bp;	/* original blkptr */
-	zbookmark_phys_t	l2rcb_zb;	/* original bookmark */
-	int			l2rcb_flags;	/* original flags */
-	abd_t			*l2rcb_abd;	/* temporary buffer */
-} l2arc_read_callback_t;
-
-typedef struct l2arc_write_callback {
-	l2arc_dev_t	*l2wcb_dev;	/* device info */
-	arc_buf_hdr_t	*l2wcb_head;	/* head of write buflist */
-} l2arc_write_callback_t;
-
-typedef struct l2arc_data_free {
-	/* protected by l2arc_free_on_write_mtx */
-	abd_t		*l2df_abd;
-	size_t		l2df_size;
-	arc_buf_contents_t l2df_type;
-	list_node_t	l2df_list_node;
-} l2arc_data_free_t;
-
-static kmutex_t l2arc_feed_thr_lock;
-static kcondvar_t l2arc_feed_thr_cv;
-static uint8_t l2arc_thread_exit;
-
-static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, void *, boolean_t);
-static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, void *);
-static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, void *, boolean_t);
-static void arc_free_data_abd(arc_buf_hdr_t *, abd_t *, uint64_t, void *);
-static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, void *);
-static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag);
-static void arc_hdr_free_pabd(arc_buf_hdr_t *);
-static void arc_hdr_alloc_pabd(arc_buf_hdr_t *, boolean_t);
-static void arc_access(arc_buf_hdr_t *, kmutex_t *);
-static boolean_t arc_is_overflowing();
-static void arc_buf_watch(arc_buf_t *);
-static void arc_prune_async(int64_t);
-
-static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
-static uint32_t arc_bufc_to_flags(arc_buf_contents_t);
-static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); -static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); - -static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *); -static void l2arc_read_done(zio_t *); - -static void -l2arc_trim(const arc_buf_hdr_t *hdr) -{ - l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; - - ASSERT(HDR_HAS_L2HDR(hdr)); - ASSERT(MUTEX_HELD(&dev->l2ad_mtx)); - - if (HDR_GET_PSIZE(hdr) != 0) { - trim_map_free(dev->l2ad_vdev, hdr->b_l2hdr.b_daddr, - HDR_GET_PSIZE(hdr), 0); - } -} - -/* - * We use Cityhash for this. It's fast, and has good hash properties without - * requiring any large static buffers. - */ -static uint64_t -buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) -{ - return (cityhash4(spa, dva->dva_word[0], dva->dva_word[1], birth)); -} - -#define HDR_EMPTY(hdr) \ - ((hdr)->b_dva.dva_word[0] == 0 && \ - (hdr)->b_dva.dva_word[1] == 0) - -#define HDR_EQUAL(spa, dva, birth, hdr) \ - ((hdr)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ - ((hdr)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ - ((hdr)->b_birth == birth) && ((hdr)->b_spa == spa) - -static void -buf_discard_identity(arc_buf_hdr_t *hdr) -{ - hdr->b_dva.dva_word[0] = 0; - hdr->b_dva.dva_word[1] = 0; - hdr->b_birth = 0; -} - -static arc_buf_hdr_t * -buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp) -{ - const dva_t *dva = BP_IDENTITY(bp); - uint64_t birth = BP_PHYSICAL_BIRTH(bp); - uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); - kmutex_t *hash_lock = BUF_HASH_LOCK(idx); - arc_buf_hdr_t *hdr; - - mutex_enter(hash_lock); - for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL; - hdr = hdr->b_hash_next) { - if (HDR_EQUAL(spa, dva, birth, hdr)) { - *lockp = hash_lock; - return (hdr); - } - } - mutex_exit(hash_lock); - *lockp = NULL; - return (NULL); -} - -/* - * Insert an entry into the hash table. If there is already an element - * equal to elem in the hash table, then the already existing element - * will be returned and the new element will not be inserted. - * Otherwise returns NULL. - * If lockp == NULL, the caller is assumed to already hold the hash lock. 
- */ -static arc_buf_hdr_t * -buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp) -{ - uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); - kmutex_t *hash_lock = BUF_HASH_LOCK(idx); - arc_buf_hdr_t *fhdr; - uint32_t i; - - ASSERT(!DVA_IS_EMPTY(&hdr->b_dva)); - ASSERT(hdr->b_birth != 0); - ASSERT(!HDR_IN_HASH_TABLE(hdr)); - - if (lockp != NULL) { - *lockp = hash_lock; - mutex_enter(hash_lock); - } else { - ASSERT(MUTEX_HELD(hash_lock)); - } - - for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL; - fhdr = fhdr->b_hash_next, i++) { - if (HDR_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr)) - return (fhdr); - } - - hdr->b_hash_next = buf_hash_table.ht_table[idx]; - buf_hash_table.ht_table[idx] = hdr; - arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE); - - /* collect some hash table performance data */ - if (i > 0) { - ARCSTAT_BUMP(arcstat_hash_collisions); - if (i == 1) - ARCSTAT_BUMP(arcstat_hash_chains); - - ARCSTAT_MAX(arcstat_hash_chain_max, i); - } - - ARCSTAT_BUMP(arcstat_hash_elements); - ARCSTAT_MAXSTAT(arcstat_hash_elements); - - return (NULL); -} - -static void -buf_hash_remove(arc_buf_hdr_t *hdr) -{ - arc_buf_hdr_t *fhdr, **hdrp; - uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); - - ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); - ASSERT(HDR_IN_HASH_TABLE(hdr)); - - hdrp = &buf_hash_table.ht_table[idx]; - while ((fhdr = *hdrp) != hdr) { - ASSERT3P(fhdr, !=, NULL); - hdrp = &fhdr->b_hash_next; - } - *hdrp = hdr->b_hash_next; - hdr->b_hash_next = NULL; - arc_hdr_clear_flags(hdr, ARC_FLAG_IN_HASH_TABLE); - - /* collect some hash table performance data */ - ARCSTAT_BUMPDOWN(arcstat_hash_elements); - - if (buf_hash_table.ht_table[idx] && - buf_hash_table.ht_table[idx]->b_hash_next == NULL) - ARCSTAT_BUMPDOWN(arcstat_hash_chains); -} - -/* - * Global data structures and functions for the buf kmem cache. - */ -static kmem_cache_t *hdr_full_cache; -static kmem_cache_t *hdr_l2only_cache; -static kmem_cache_t *buf_cache; - -static void -buf_fini(void) -{ - int i; - - kmem_free(buf_hash_table.ht_table, - (buf_hash_table.ht_mask + 1) * sizeof (void *)); - for (i = 0; i < BUF_LOCKS; i++) - mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); - kmem_cache_destroy(hdr_full_cache); - kmem_cache_destroy(hdr_l2only_cache); - kmem_cache_destroy(buf_cache); -} - -/* - * Constructor callback - called when the cache is empty - * and a new buf is requested. - */ -/* ARGSUSED */ -static int -hdr_full_cons(void *vbuf, void *unused, int kmflag) -{ - arc_buf_hdr_t *hdr = vbuf; - - bzero(hdr, HDR_FULL_SIZE); - cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL); - zfs_refcount_create(&hdr->b_l1hdr.b_refcnt); - mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); - multilist_link_init(&hdr->b_l1hdr.b_arc_node); - arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS); - - return (0); -} - -/* ARGSUSED */ -static int -hdr_l2only_cons(void *vbuf, void *unused, int kmflag) -{ - arc_buf_hdr_t *hdr = vbuf; - - bzero(hdr, HDR_L2ONLY_SIZE); - arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); - - return (0); -} - -/* ARGSUSED */ -static int -buf_cons(void *vbuf, void *unused, int kmflag) -{ - arc_buf_t *buf = vbuf; - - bzero(buf, sizeof (arc_buf_t)); - mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL); - arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS); - - return (0); -} - -/* - * Destructor callback - called when a cached buf is - * no longer required. 
- */ -/* ARGSUSED */ -static void -hdr_full_dest(void *vbuf, void *unused) -{ - arc_buf_hdr_t *hdr = vbuf; - - ASSERT(HDR_EMPTY(hdr)); - cv_destroy(&hdr->b_l1hdr.b_cv); - zfs_refcount_destroy(&hdr->b_l1hdr.b_refcnt); - mutex_destroy(&hdr->b_l1hdr.b_freeze_lock); - ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); - arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS); -} - -/* ARGSUSED */ -static void -hdr_l2only_dest(void *vbuf, void *unused) -{ - arc_buf_hdr_t *hdr = vbuf; - - ASSERT(HDR_EMPTY(hdr)); - arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); -} - -/* ARGSUSED */ -static void -buf_dest(void *vbuf, void *unused) -{ - arc_buf_t *buf = vbuf; - - mutex_destroy(&buf->b_evict_lock); - arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS); -} - -/* - * Reclaim callback -- invoked when memory is low. - */ -/* ARGSUSED */ -static void -hdr_recl(void *unused) -{ - dprintf("hdr_recl called\n"); - /* - * umem calls the reclaim func when we destroy the buf cache, - * which is after we do arc_fini(). - */ - if (arc_initialized) - zthr_wakeup(arc_reap_zthr); -} - -static void -buf_init(void) -{ - uint64_t *ct; - uint64_t hsize = 1ULL << 12; - int i, j; - - /* - * The hash table is big enough to fill all of physical memory - * with an average block size of zfs_arc_average_blocksize (default 8K). - * By default, the table will take up - * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers). - */ - while (hsize * zfs_arc_average_blocksize < (uint64_t)physmem * PAGESIZE) - hsize <<= 1; -retry: - buf_hash_table.ht_mask = hsize - 1; - buf_hash_table.ht_table = - kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP); - if (buf_hash_table.ht_table == NULL) { - ASSERT(hsize > (1ULL << 8)); - hsize >>= 1; - goto retry; - } - - hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE, - 0, hdr_full_cons, hdr_full_dest, hdr_recl, NULL, NULL, 0); - hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only", - HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, hdr_recl, - NULL, NULL, 0); - buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), - 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); - - for (i = 0; i < 256; i++) - for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) - *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); - - for (i = 0; i < BUF_LOCKS; i++) { - mutex_init(&buf_hash_table.ht_locks[i].ht_lock, - NULL, MUTEX_DEFAULT, NULL); - } -} - -/* - * This is the size that the buf occupies in memory. If the buf is compressed, - * it will correspond to the compressed size. You should use this method of - * getting the buf size unless you explicitly need the logical size. - */ -int32_t -arc_buf_size(arc_buf_t *buf) -{ - return (ARC_BUF_COMPRESSED(buf) ? - HDR_GET_PSIZE(buf->b_hdr) : HDR_GET_LSIZE(buf->b_hdr)); -} - -int32_t -arc_buf_lsize(arc_buf_t *buf) -{ - return (HDR_GET_LSIZE(buf->b_hdr)); -} - -enum zio_compress -arc_get_compression(arc_buf_t *buf) -{ - return (ARC_BUF_COMPRESSED(buf) ? 
- HDR_GET_COMPRESS(buf->b_hdr) : ZIO_COMPRESS_OFF); -} - -#define ARC_MINTIME (hz>>4) /* 62 ms */ - -static inline boolean_t -arc_buf_is_shared(arc_buf_t *buf) -{ - boolean_t shared = (buf->b_data != NULL && - buf->b_hdr->b_l1hdr.b_pabd != NULL && - abd_is_linear(buf->b_hdr->b_l1hdr.b_pabd) && - buf->b_data == abd_to_buf(buf->b_hdr->b_l1hdr.b_pabd)); - IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr)); - IMPLY(shared, ARC_BUF_SHARED(buf)); - IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf)); - - /* - * It would be nice to assert arc_can_share() too, but the "hdr isn't - * already being shared" requirement prevents us from doing that. - */ - - return (shared); -} - -/* - * Free the checksum associated with this header. If there is no checksum, this - * is a no-op. - */ -static inline void -arc_cksum_free(arc_buf_hdr_t *hdr) -{ - ASSERT(HDR_HAS_L1HDR(hdr)); - mutex_enter(&hdr->b_l1hdr.b_freeze_lock); - if (hdr->b_l1hdr.b_freeze_cksum != NULL) { - kmem_free(hdr->b_l1hdr.b_freeze_cksum, sizeof (zio_cksum_t)); - hdr->b_l1hdr.b_freeze_cksum = NULL; - } - mutex_exit(&hdr->b_l1hdr.b_freeze_lock); -} - -/* - * Return true iff at least one of the bufs on hdr is not compressed. - */ -static boolean_t -arc_hdr_has_uncompressed_buf(arc_buf_hdr_t *hdr) -{ - for (arc_buf_t *b = hdr->b_l1hdr.b_buf; b != NULL; b = b->b_next) { - if (!ARC_BUF_COMPRESSED(b)) { - return (B_TRUE); - } - } - return (B_FALSE); -} - -/* - * If we've turned on the ZFS_DEBUG_MODIFY flag, verify that the buf's data - * matches the checksum that is stored in the hdr. If there is no checksum, - * or if the buf is compressed, this is a no-op. - */ -static void -arc_cksum_verify(arc_buf_t *buf) -{ - arc_buf_hdr_t *hdr = buf->b_hdr; - zio_cksum_t zc; - - if (!(zfs_flags & ZFS_DEBUG_MODIFY)) - return; - - if (ARC_BUF_COMPRESSED(buf)) { - ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL || - arc_hdr_has_uncompressed_buf(hdr)); - return; - } - - ASSERT(HDR_HAS_L1HDR(hdr)); - - mutex_enter(&hdr->b_l1hdr.b_freeze_lock); - if (hdr->b_l1hdr.b_freeze_cksum == NULL || HDR_IO_ERROR(hdr)) { - mutex_exit(&hdr->b_l1hdr.b_freeze_lock); - return; - } - - fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, &zc); - if (!ZIO_CHECKSUM_EQUAL(*hdr->b_l1hdr.b_freeze_cksum, zc)) - panic("buffer modified while frozen!"); - mutex_exit(&hdr->b_l1hdr.b_freeze_lock); -} - -static boolean_t -arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio) -{ - enum zio_compress compress = BP_GET_COMPRESS(zio->io_bp); - boolean_t valid_cksum; - - ASSERT(!BP_IS_EMBEDDED(zio->io_bp)); - VERIFY3U(BP_GET_PSIZE(zio->io_bp), ==, HDR_GET_PSIZE(hdr)); - - /* - * We rely on the blkptr's checksum to determine if the block - * is valid or not. When compressed arc is enabled, the l2arc - * writes the block to the l2arc just as it appears in the pool. - * This allows us to use the blkptr's checksum to validate the - * data that we just read off of the l2arc without having to store - * a separate checksum in the arc_buf_hdr_t. However, if compressed - * arc is disabled, then the data written to the l2arc is always - * uncompressed and won't match the block as it exists in the main - * pool. When this is the case, we must first compress it if it is - * compressed on the main pool before we can validate the checksum. 
- */ - if (!HDR_COMPRESSION_ENABLED(hdr) && compress != ZIO_COMPRESS_OFF) { - ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); - uint64_t lsize = HDR_GET_LSIZE(hdr); - uint64_t csize; - - abd_t *cdata = abd_alloc_linear(HDR_GET_PSIZE(hdr), B_TRUE); - csize = zio_compress_data(compress, zio->io_abd, - abd_to_buf(cdata), lsize); - - ASSERT3U(csize, <=, HDR_GET_PSIZE(hdr)); - if (csize < HDR_GET_PSIZE(hdr)) { - /* - * Compressed blocks are always a multiple of the - * smallest ashift in the pool. Ideally, we would - * like to round up the csize to the next - * spa_min_ashift but that value may have changed - * since the block was last written. Instead, - * we rely on the fact that the hdr's psize - * was set to the psize of the block when it was - * last written. We set the csize to that value - * and zero out any part that should not contain - * data. - */ - abd_zero_off(cdata, csize, HDR_GET_PSIZE(hdr) - csize); - csize = HDR_GET_PSIZE(hdr); - } - zio_push_transform(zio, cdata, csize, HDR_GET_PSIZE(hdr), NULL); - } - - /* - * Block pointers always store the checksum for the logical data. - * If the block pointer has the gang bit set, then the checksum - * it represents is for the reconstituted data and not for an - * individual gang member. The zio pipeline, however, must be able to - * determine the checksum of each of the gang constituents so it - * treats the checksum comparison differently than what we need - * for l2arc blocks. This prevents us from using the - * zio_checksum_error() interface directly. Instead we must call the - * zio_checksum_error_impl() so that we can ensure the checksum is - * generated using the correct checksum algorithm and accounts for the - * logical I/O size and not just a gang fragment. - */ - valid_cksum = (zio_checksum_error_impl(zio->io_spa, zio->io_bp, - BP_GET_CHECKSUM(zio->io_bp), zio->io_abd, zio->io_size, - zio->io_offset, NULL) == 0); - zio_pop_transforms(zio); - return (valid_cksum); -} - -/* - * Given a buf full of data, if ZFS_DEBUG_MODIFY is enabled this computes a - * checksum and attaches it to the buf's hdr so that we can ensure that the buf - * isn't modified later on. If buf is compressed or there is already a checksum - * on the hdr, this is a no-op (we only checksum uncompressed bufs). 
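[Editor's sketch, not part of the removed file.] The ZFS_DEBUG_MODIFY scheme described in the comment above amounts to: stash a checksum of the uncompressed buffer at "freeze" time, recompute it on verify, and abort if the buffer changed underneath us. A minimal standalone version, with a simple split checksum standing in for fletcher_2_native() and invented type names:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct frozen_buf {
	uint8_t		*data;
	size_t		size;
	uint64_t	cksum[2];	/* stand-in for zio_cksum_t */
	int		frozen;
};

/* Simple two-accumulator checksum; a stand-in for fletcher_2_native(). */
static void
toy_cksum(const uint8_t *p, size_t n, uint64_t out[2])
{
	uint64_t a = 0, b = 0;

	for (size_t i = 0; i < n; i++) {
		a += p[i];
		b += a;
	}
	out[0] = a;
	out[1] = b;
}

static void
buf_freeze(struct frozen_buf *fb)
{
	toy_cksum(fb->data, fb->size, fb->cksum);
	fb->frozen = 1;
}

static void
buf_verify(const struct frozen_buf *fb)
{
	uint64_t now[2];

	if (!fb->frozen)
		return;			/* no checksum recorded: nothing to check */
	toy_cksum(fb->data, fb->size, now);
	if (now[0] != fb->cksum[0] || now[1] != fb->cksum[1]) {
		fprintf(stderr, "buffer modified while frozen!\n");
		abort();
	}
}

int
main(void)
{
	uint8_t payload[64] = { 0 };
	struct frozen_buf fb = { payload, sizeof (payload), { 0, 0 }, 0 };

	buf_freeze(&fb);
	buf_verify(&fb);	/* passes: nothing changed */
	payload[7] = 0xff;	/* simulate a stray write */
	buf_verify(&fb);	/* aborts, like the panic in arc_cksum_verify() */
	return (0);
}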
- */ -static void -arc_cksum_compute(arc_buf_t *buf) -{ - arc_buf_hdr_t *hdr = buf->b_hdr; - - if (!(zfs_flags & ZFS_DEBUG_MODIFY)) - return; - - ASSERT(HDR_HAS_L1HDR(hdr)); - - mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); - if (hdr->b_l1hdr.b_freeze_cksum != NULL) { - ASSERT(arc_hdr_has_uncompressed_buf(hdr)); - mutex_exit(&hdr->b_l1hdr.b_freeze_lock); - return; - } else if (ARC_BUF_COMPRESSED(buf)) { - mutex_exit(&hdr->b_l1hdr.b_freeze_lock); - return; - } - - ASSERT(!ARC_BUF_COMPRESSED(buf)); - hdr->b_l1hdr.b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), - KM_SLEEP); - fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, - hdr->b_l1hdr.b_freeze_cksum); - mutex_exit(&hdr->b_l1hdr.b_freeze_lock); -#ifdef illumos - arc_buf_watch(buf); -#endif -} - -#ifdef illumos -#ifndef _KERNEL -typedef struct procctl { - long cmd; - prwatch_t prwatch; -} procctl_t; -#endif - -/* ARGSUSED */ -static void -arc_buf_unwatch(arc_buf_t *buf) -{ -#ifndef _KERNEL - if (arc_watch) { - int result; - procctl_t ctl; - ctl.cmd = PCWATCH; - ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; - ctl.prwatch.pr_size = 0; - ctl.prwatch.pr_wflags = 0; - result = write(arc_procfd, &ctl, sizeof (ctl)); - ASSERT3U(result, ==, sizeof (ctl)); - } -#endif -} - -/* ARGSUSED */ -static void -arc_buf_watch(arc_buf_t *buf) -{ -#ifndef _KERNEL - if (arc_watch) { - int result; - procctl_t ctl; - ctl.cmd = PCWATCH; - ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; - ctl.prwatch.pr_size = arc_buf_size(buf); - ctl.prwatch.pr_wflags = WA_WRITE; - result = write(arc_procfd, &ctl, sizeof (ctl)); - ASSERT3U(result, ==, sizeof (ctl)); - } -#endif -} -#endif /* illumos */ - -static arc_buf_contents_t -arc_buf_type(arc_buf_hdr_t *hdr) -{ - arc_buf_contents_t type; - if (HDR_ISTYPE_METADATA(hdr)) { - type = ARC_BUFC_METADATA; - } else { - type = ARC_BUFC_DATA; - } - VERIFY3U(hdr->b_type, ==, type); - return (type); -} - -boolean_t -arc_is_metadata(arc_buf_t *buf) -{ - return (HDR_ISTYPE_METADATA(buf->b_hdr) != 0); -} - -static uint32_t -arc_bufc_to_flags(arc_buf_contents_t type) -{ - switch (type) { - case ARC_BUFC_DATA: - /* metadata field is 0 if buffer contains normal data */ - return (0); - case ARC_BUFC_METADATA: - return (ARC_FLAG_BUFC_METADATA); - default: - break; - } - panic("undefined ARC buffer type!"); - return ((uint32_t)-1); -} - -void -arc_buf_thaw(arc_buf_t *buf) -{ - arc_buf_hdr_t *hdr = buf->b_hdr; - - ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); - ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - - arc_cksum_verify(buf); - - /* - * Compressed buffers do not manipulate the b_freeze_cksum or - * allocate b_thawed. 
- */ - if (ARC_BUF_COMPRESSED(buf)) { - ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL || - arc_hdr_has_uncompressed_buf(hdr)); - return; - } - - ASSERT(HDR_HAS_L1HDR(hdr)); - arc_cksum_free(hdr); - - mutex_enter(&hdr->b_l1hdr.b_freeze_lock); -#ifdef ZFS_DEBUG - if (zfs_flags & ZFS_DEBUG_MODIFY) { - if (hdr->b_l1hdr.b_thawed != NULL) - kmem_free(hdr->b_l1hdr.b_thawed, 1); - hdr->b_l1hdr.b_thawed = kmem_alloc(1, KM_SLEEP); - } -#endif - - mutex_exit(&hdr->b_l1hdr.b_freeze_lock); - -#ifdef illumos - arc_buf_unwatch(buf); -#endif -} - -void -arc_buf_freeze(arc_buf_t *buf) -{ - arc_buf_hdr_t *hdr = buf->b_hdr; - kmutex_t *hash_lock; - - if (!(zfs_flags & ZFS_DEBUG_MODIFY)) - return; - - if (ARC_BUF_COMPRESSED(buf)) { - ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL || - arc_hdr_has_uncompressed_buf(hdr)); - return; - } - - hash_lock = HDR_LOCK(hdr); - mutex_enter(hash_lock); - - ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT(hdr->b_l1hdr.b_freeze_cksum != NULL || - hdr->b_l1hdr.b_state == arc_anon); - arc_cksum_compute(buf); - mutex_exit(hash_lock); -} - -/* - * The arc_buf_hdr_t's b_flags should never be modified directly. Instead, - * the following functions should be used to ensure that the flags are - * updated in a thread-safe way. When manipulating the flags either - * the hash_lock must be held or the hdr must be undiscoverable. This - * ensures that we're not racing with any other threads when updating - * the flags. - */ -static inline void -arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags) -{ - ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); - hdr->b_flags |= flags; -} - -static inline void -arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags) -{ - ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); - hdr->b_flags &= ~flags; -} - -/* - * Setting the compression bits in the arc_buf_hdr_t's b_flags is - * done in a special way since we have to clear and set bits - * at the same time. Consumers that wish to set the compression bits - * must use this function to ensure that the flags are updated in - * thread-safe manner. - */ -static void -arc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp) -{ - ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); - - /* - * Holes and embedded blocks will always have a psize = 0 so - * we ignore the compression of the blkptr and set the - * arc_buf_hdr_t's compression to ZIO_COMPRESS_OFF. - * Holes and embedded blocks remain anonymous so we don't - * want to uncompress them. Mark them as uncompressed. - */ - if (!zfs_compressed_arc_enabled || HDR_GET_PSIZE(hdr) == 0) { - arc_hdr_clear_flags(hdr, ARC_FLAG_COMPRESSED_ARC); - HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF); - ASSERT(!HDR_COMPRESSION_ENABLED(hdr)); - ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); - } else { - arc_hdr_set_flags(hdr, ARC_FLAG_COMPRESSED_ARC); - HDR_SET_COMPRESS(hdr, cmp); - ASSERT3U(HDR_GET_COMPRESS(hdr), ==, cmp); - ASSERT(HDR_COMPRESSION_ENABLED(hdr)); - } -} - -/* - * Looks for another buf on the same hdr which has the data decompressed, copies - * from it, and returns true. If no such buf exists, returns false. 
- */ -static boolean_t -arc_buf_try_copy_decompressed_data(arc_buf_t *buf) -{ - arc_buf_hdr_t *hdr = buf->b_hdr; - boolean_t copied = B_FALSE; - - ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT3P(buf->b_data, !=, NULL); - ASSERT(!ARC_BUF_COMPRESSED(buf)); - - for (arc_buf_t *from = hdr->b_l1hdr.b_buf; from != NULL; - from = from->b_next) { - /* can't use our own data buffer */ - if (from == buf) { - continue; - } - - if (!ARC_BUF_COMPRESSED(from)) { - bcopy(from->b_data, buf->b_data, arc_buf_size(buf)); - copied = B_TRUE; - break; - } - } - - /* - * There were no decompressed bufs, so there should not be a - * checksum on the hdr either. - */ - EQUIV(!copied, hdr->b_l1hdr.b_freeze_cksum == NULL); - - return (copied); -} - -/* - * Given a buf that has a data buffer attached to it, this function will - * efficiently fill the buf with data of the specified compression setting from - * the hdr and update the hdr's b_freeze_cksum if necessary. If the buf and hdr - * are already sharing a data buf, no copy is performed. - * - * If the buf is marked as compressed but uncompressed data was requested, this - * will allocate a new data buffer for the buf, remove that flag, and fill the - * buf with uncompressed data. You can't request a compressed buf on a hdr with - * uncompressed data, and (since we haven't added support for it yet) if you - * want compressed data your buf must already be marked as compressed and have - * the correct-sized data buffer. - */ -static int -arc_buf_fill(arc_buf_t *buf, boolean_t compressed) -{ - arc_buf_hdr_t *hdr = buf->b_hdr; - boolean_t hdr_compressed = (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF); - dmu_object_byteswap_t bswap = hdr->b_l1hdr.b_byteswap; - - ASSERT3P(buf->b_data, !=, NULL); - IMPLY(compressed, hdr_compressed); - IMPLY(compressed, ARC_BUF_COMPRESSED(buf)); - - if (hdr_compressed == compressed) { - if (!arc_buf_is_shared(buf)) { - abd_copy_to_buf(buf->b_data, hdr->b_l1hdr.b_pabd, - arc_buf_size(buf)); - } - } else { - ASSERT(hdr_compressed); - ASSERT(!compressed); - ASSERT3U(HDR_GET_LSIZE(hdr), !=, HDR_GET_PSIZE(hdr)); - - /* - * If the buf is sharing its data with the hdr, unlink it and - * allocate a new data buffer for the buf. - */ - if (arc_buf_is_shared(buf)) { - ASSERT(ARC_BUF_COMPRESSED(buf)); - - /* We need to give the buf it's own b_data */ - buf->b_flags &= ~ARC_BUF_FLAG_SHARED; - buf->b_data = - arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); - arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); - - /* Previously overhead was 0; just add new overhead */ - ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr)); - } else if (ARC_BUF_COMPRESSED(buf)) { - /* We need to reallocate the buf's b_data */ - arc_free_data_buf(hdr, buf->b_data, HDR_GET_PSIZE(hdr), - buf); - buf->b_data = - arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); - - /* We increased the size of b_data; update overhead */ - ARCSTAT_INCR(arcstat_overhead_size, - HDR_GET_LSIZE(hdr) - HDR_GET_PSIZE(hdr)); - } - - /* - * Regardless of the buf's previous compression settings, it - * should not be compressed at the end of this function. - */ - buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED; - - /* - * Try copying the data from another buf which already has a - * decompressed version. If that's not possible, it's time to - * bite the bullet and decompress the data from the hdr. 
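[Editor's sketch, not part of the removed file.] The fill logic just described has two paths: copy from a sibling buffer that is already decompressed, or fall back to decompressing the header's on-disk image. A toy model with invented structs and a constant-fill "compression" standing in for zio_decompress_data():

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct toy_buf {
	struct toy_buf	*next;
	uint8_t		*data;
	int		compressed;
};

struct toy_hdr {
	struct toy_buf	*bufs;		/* all bufs attached to this header */
	const uint8_t	*pabd;		/* header's copy of the on-disk data */
	size_t		psize;
	size_t		lsize;
};

/* Stand-in for zio_decompress_data(): constant-fill "compression". */
static int
toy_decompress(const uint8_t *src, size_t psize, uint8_t *dst, size_t lsize)
{
	if (psize < 1)
		return (EIO);
	memset(dst, src[0], lsize);
	return (0);
}

static int
toy_buf_fill(struct toy_hdr *hdr, struct toy_buf *buf)
{
	/* Cheap path: copy from a sibling that is already uncompressed. */
	for (struct toy_buf *from = hdr->bufs; from != NULL; from = from->next) {
		if (from == buf || from->compressed)
			continue;
		memcpy(buf->data, from->data, hdr->lsize);
		return (0);
	}
	/* Otherwise decompress from the header's on-disk image. */
	return (toy_decompress(hdr->pabd, hdr->psize, buf->data, hdr->lsize));
}

int
main(void)
{
	uint8_t ondisk[1] = { 0xab };		/* "compressed": one byte */
	uint8_t data_a[16], data_b[16];
	struct toy_buf a = { NULL, data_a, 0 };
	struct toy_buf b = { &a, data_b, 0 };
	struct toy_hdr hdr = { &b, ondisk, sizeof (ondisk), sizeof (data_a) };

	memset(data_a, 0xab, sizeof (data_a));	/* 'a' is already decompressed */
	if (toy_buf_fill(&hdr, &b) == 0)
		printf("b filled, first byte 0x%02x\n", data_b[0]);
	return (0);
}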
- */ - if (arc_buf_try_copy_decompressed_data(buf)) { - /* Skip byteswapping and checksumming (already done) */ - ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, !=, NULL); - return (0); - } else { - int error = zio_decompress_data(HDR_GET_COMPRESS(hdr), - hdr->b_l1hdr.b_pabd, buf->b_data, - HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr)); - - /* - * Absent hardware errors or software bugs, this should - * be impossible, but log it anyway so we can debug it. - */ - if (error != 0) { - zfs_dbgmsg( - "hdr %p, compress %d, psize %d, lsize %d", - hdr, HDR_GET_COMPRESS(hdr), - HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr)); - return (SET_ERROR(EIO)); - } - } - } - - /* Byteswap the buf's data if necessary */ - if (bswap != DMU_BSWAP_NUMFUNCS) { - ASSERT(!HDR_SHARED_DATA(hdr)); - ASSERT3U(bswap, <, DMU_BSWAP_NUMFUNCS); - dmu_ot_byteswap[bswap].ob_func(buf->b_data, HDR_GET_LSIZE(hdr)); - } - - /* Compute the hdr's checksum if necessary */ - arc_cksum_compute(buf); - - return (0); -} - -int -arc_decompress(arc_buf_t *buf) -{ - return (arc_buf_fill(buf, B_FALSE)); -} - -/* - * Return the size of the block, b_pabd, that is stored in the arc_buf_hdr_t. - */ -static uint64_t -arc_hdr_size(arc_buf_hdr_t *hdr) -{ - uint64_t size; - - if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && - HDR_GET_PSIZE(hdr) > 0) { - size = HDR_GET_PSIZE(hdr); - } else { - ASSERT3U(HDR_GET_LSIZE(hdr), !=, 0); - size = HDR_GET_LSIZE(hdr); - } - return (size); -} - -/* - * Increment the amount of evictable space in the arc_state_t's refcount. - * We account for the space used by the hdr and the arc buf individually - * so that we can add and remove them from the refcount individually. - */ -static void -arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state) -{ - arc_buf_contents_t type = arc_buf_type(hdr); - - ASSERT(HDR_HAS_L1HDR(hdr)); - - if (GHOST_STATE(state)) { - ASSERT0(hdr->b_l1hdr.b_bufcnt); - ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); - ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); - (void) zfs_refcount_add_many(&state->arcs_esize[type], - HDR_GET_LSIZE(hdr), hdr); - return; - } - - ASSERT(!GHOST_STATE(state)); - if (hdr->b_l1hdr.b_pabd != NULL) { - (void) zfs_refcount_add_many(&state->arcs_esize[type], - arc_hdr_size(hdr), hdr); - } - for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; - buf = buf->b_next) { - if (arc_buf_is_shared(buf)) - continue; - (void) zfs_refcount_add_many(&state->arcs_esize[type], - arc_buf_size(buf), buf); - } -} - -/* - * Decrement the amount of evictable space in the arc_state_t's refcount. - * We account for the space used by the hdr and the arc buf individually - * so that we can add and remove them from the refcount individually. 
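[Editor's sketch, not part of the removed file.] A concrete reading of the accounting rule above: ghost headers contribute only their logical size, while resident headers contribute the header's own data plus every buffer that is not sharing that data. Struct and field names below are invented for illustration.

#include <stdint.h>
#include <stdio.h>

struct toy_buf {
	struct toy_buf	*next;
	uint64_t	size;
	int		shared;		/* shares its data with the header */
};

struct toy_hdr {
	struct toy_buf	*bufs;
	uint64_t	hdr_data_size;	/* psize or lsize, as in arc_hdr_size() */
	uint64_t	lsize;
	int		ghost;
};

static uint64_t
evictable_space(const struct toy_hdr *hdr)
{
	uint64_t space = 0;

	if (hdr->ghost)
		return (hdr->lsize);	/* ghosts track only the logical size */

	if (hdr->hdr_data_size != 0)
		space += hdr->hdr_data_size;
	for (const struct toy_buf *b = hdr->bufs; b != NULL; b = b->next) {
		if (!b->shared)		/* shared bufs are counted via the hdr */
			space += b->size;
	}
	return (space);
}

int
main(void)
{
	struct toy_buf shared = { NULL, 131072, 1 };
	struct toy_buf priv = { &shared, 131072, 0 };
	struct toy_hdr hdr = { &priv, 16384, 131072, 0 };

	printf("evictable: %llu bytes\n",
	    (unsigned long long)evictable_space(&hdr));
	return (0);
}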
- */ -static void -arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state) -{ - arc_buf_contents_t type = arc_buf_type(hdr); - - ASSERT(HDR_HAS_L1HDR(hdr)); - - if (GHOST_STATE(state)) { - ASSERT0(hdr->b_l1hdr.b_bufcnt); - ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); - ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); - (void) zfs_refcount_remove_many(&state->arcs_esize[type], - HDR_GET_LSIZE(hdr), hdr); - return; - } - - ASSERT(!GHOST_STATE(state)); - if (hdr->b_l1hdr.b_pabd != NULL) { - (void) zfs_refcount_remove_many(&state->arcs_esize[type], - arc_hdr_size(hdr), hdr); - } - for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; - buf = buf->b_next) { - if (arc_buf_is_shared(buf)) - continue; - (void) zfs_refcount_remove_many(&state->arcs_esize[type], - arc_buf_size(buf), buf); - } -} - -/* - * Add a reference to this hdr indicating that someone is actively - * referencing that memory. When the refcount transitions from 0 to 1, - * we remove it from the respective arc_state_t list to indicate that - * it is not evictable. - */ -static void -add_reference(arc_buf_hdr_t *hdr, void *tag) -{ - ASSERT(HDR_HAS_L1HDR(hdr)); - if (!MUTEX_HELD(HDR_LOCK(hdr))) { - ASSERT(hdr->b_l1hdr.b_state == arc_anon); - ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); - ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); - } - - arc_state_t *state = hdr->b_l1hdr.b_state; - - if ((zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) && - (state != arc_anon)) { - /* We don't use the L2-only state list. */ - if (state != arc_l2c_only) { - multilist_remove(state->arcs_list[arc_buf_type(hdr)], - hdr); - arc_evictable_space_decrement(hdr, state); - } - /* remove the prefetch flag if we get a reference */ - arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH); - } -} - -/* - * Remove a reference from this hdr. When the reference transitions from - * 1 to 0 and we're not anonymous, then we add this hdr to the arc_state_t's - * list making it eligible for eviction. - */ -static int -remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) -{ - int cnt; - arc_state_t *state = hdr->b_l1hdr.b_state; - - ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); - ASSERT(!GHOST_STATE(state)); - - /* - * arc_l2c_only counts as a ghost state so we don't need to explicitly - * check to prevent usage of the arc_l2c_only list. - */ - if (((cnt = zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) && - (state != arc_anon)) { - multilist_insert(state->arcs_list[arc_buf_type(hdr)], hdr); - ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0); - arc_evictable_space_increment(hdr, state); - } - return (cnt); -} - -/* - * Returns detailed information about a specific arc buffer. When the - * state_index argument is set the function will calculate the arc header - * list position for its arc state. Since this requires a linear traversal - * callers are strongly encourage not to do this. However, it can be helpful - * for targeted analysis so the functionality is provided. 
- */ -void -arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index) -{ - arc_buf_hdr_t *hdr = ab->b_hdr; - l1arc_buf_hdr_t *l1hdr = NULL; - l2arc_buf_hdr_t *l2hdr = NULL; - arc_state_t *state = NULL; - - memset(abi, 0, sizeof (arc_buf_info_t)); - - if (hdr == NULL) - return; - - abi->abi_flags = hdr->b_flags; - - if (HDR_HAS_L1HDR(hdr)) { - l1hdr = &hdr->b_l1hdr; - state = l1hdr->b_state; - } - if (HDR_HAS_L2HDR(hdr)) - l2hdr = &hdr->b_l2hdr; - - if (l1hdr) { - abi->abi_bufcnt = l1hdr->b_bufcnt; - abi->abi_access = l1hdr->b_arc_access; - abi->abi_mru_hits = l1hdr->b_mru_hits; - abi->abi_mru_ghost_hits = l1hdr->b_mru_ghost_hits; - abi->abi_mfu_hits = l1hdr->b_mfu_hits; - abi->abi_mfu_ghost_hits = l1hdr->b_mfu_ghost_hits; - abi->abi_holds = zfs_refcount_count(&l1hdr->b_refcnt); - } - - if (l2hdr) { - abi->abi_l2arc_dattr = l2hdr->b_daddr; - abi->abi_l2arc_hits = l2hdr->b_hits; - } - - abi->abi_state_type = state ? state->arcs_state : ARC_STATE_ANON; - abi->abi_state_contents = arc_buf_type(hdr); - abi->abi_size = arc_hdr_size(hdr); -} - -/* - * Move the supplied buffer to the indicated state. The hash lock - * for the buffer must be held by the caller. - */ -static void -arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, - kmutex_t *hash_lock) -{ - arc_state_t *old_state; - int64_t refcnt; - uint32_t bufcnt; - boolean_t update_old, update_new; - arc_buf_contents_t buftype = arc_buf_type(hdr); - - /* - * We almost always have an L1 hdr here, since we call arc_hdr_realloc() - * in arc_read() when bringing a buffer out of the L2ARC. However, the - * L1 hdr doesn't always exist when we change state to arc_anon before - * destroying a header, in which case reallocating to add the L1 hdr is - * pointless. - */ - if (HDR_HAS_L1HDR(hdr)) { - old_state = hdr->b_l1hdr.b_state; - refcnt = zfs_refcount_count(&hdr->b_l1hdr.b_refcnt); - bufcnt = hdr->b_l1hdr.b_bufcnt; - update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pabd != NULL); - } else { - old_state = arc_l2c_only; - refcnt = 0; - bufcnt = 0; - update_old = B_FALSE; - } - update_new = update_old; - - ASSERT(MUTEX_HELD(hash_lock)); - ASSERT3P(new_state, !=, old_state); - ASSERT(!GHOST_STATE(new_state) || bufcnt == 0); - ASSERT(old_state != arc_anon || bufcnt <= 1); - - /* - * If this buffer is evictable, transfer it from the - * old state list to the new state list. - */ - if (refcnt == 0) { - if (old_state != arc_anon && old_state != arc_l2c_only) { - ASSERT(HDR_HAS_L1HDR(hdr)); - multilist_remove(old_state->arcs_list[buftype], hdr); - - if (GHOST_STATE(old_state)) { - ASSERT0(bufcnt); - ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); - update_old = B_TRUE; - } - arc_evictable_space_decrement(hdr, old_state); - } - if (new_state != arc_anon && new_state != arc_l2c_only) { - - /* - * An L1 header always exists here, since if we're - * moving to some L1-cached state (i.e. not l2c_only or - * anonymous), we realloc the header to add an L1hdr - * beforehand. 
- */ - ASSERT(HDR_HAS_L1HDR(hdr)); - multilist_insert(new_state->arcs_list[buftype], hdr); - - if (GHOST_STATE(new_state)) { - ASSERT0(bufcnt); - ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); - update_new = B_TRUE; - } - arc_evictable_space_increment(hdr, new_state); - } - } - - ASSERT(!HDR_EMPTY(hdr)); - if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr)) - buf_hash_remove(hdr); - - /* adjust state sizes (ignore arc_l2c_only) */ - - if (update_new && new_state != arc_l2c_only) { - ASSERT(HDR_HAS_L1HDR(hdr)); - if (GHOST_STATE(new_state)) { - ASSERT0(bufcnt); - - /* - * When moving a header to a ghost state, we first - * remove all arc buffers. Thus, we'll have a - * bufcnt of zero, and no arc buffer to use for - * the reference. As a result, we use the arc - * header pointer for the reference. - */ - (void) zfs_refcount_add_many(&new_state->arcs_size, - HDR_GET_LSIZE(hdr), hdr); - ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); - } else { - uint32_t buffers = 0; - - /* - * Each individual buffer holds a unique reference, - * thus we must remove each of these references one - * at a time. - */ - for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; - buf = buf->b_next) { - ASSERT3U(bufcnt, !=, 0); - buffers++; - - /* - * When the arc_buf_t is sharing the data - * block with the hdr, the owner of the - * reference belongs to the hdr. Only - * add to the refcount if the arc_buf_t is - * not shared. - */ - if (arc_buf_is_shared(buf)) - continue; - - (void) zfs_refcount_add_many( - &new_state->arcs_size, - arc_buf_size(buf), buf); - } - ASSERT3U(bufcnt, ==, buffers); - - if (hdr->b_l1hdr.b_pabd != NULL) { - (void) zfs_refcount_add_many( - &new_state->arcs_size, - arc_hdr_size(hdr), hdr); - } else { - ASSERT(GHOST_STATE(old_state)); - } - } - } - - if (update_old && old_state != arc_l2c_only) { - ASSERT(HDR_HAS_L1HDR(hdr)); - if (GHOST_STATE(old_state)) { - ASSERT0(bufcnt); - ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); - - /* - * When moving a header off of a ghost state, - * the header will not contain any arc buffers. - * We use the arc header pointer for the reference - * which is exactly what we did when we put the - * header on the ghost state. - */ - - (void) zfs_refcount_remove_many(&old_state->arcs_size, - HDR_GET_LSIZE(hdr), hdr); - } else { - uint32_t buffers = 0; - - /* - * Each individual buffer holds a unique reference, - * thus we must remove each of these references one - * at a time. - */ - for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; - buf = buf->b_next) { - ASSERT3U(bufcnt, !=, 0); - buffers++; - - /* - * When the arc_buf_t is sharing the data - * block with the hdr, the owner of the - * reference belongs to the hdr. Only - * add to the refcount if the arc_buf_t is - * not shared. - */ - if (arc_buf_is_shared(buf)) - continue; - - (void) zfs_refcount_remove_many( - &old_state->arcs_size, arc_buf_size(buf), - buf); - } - ASSERT3U(bufcnt, ==, buffers); - ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); - (void) zfs_refcount_remove_many( - &old_state->arcs_size, arc_hdr_size(hdr), hdr); - } - } - - if (HDR_HAS_L1HDR(hdr)) - hdr->b_l1hdr.b_state = new_state; - - /* - * L2 headers should never be on the L2 state list since they don't - * have L1 headers allocated. 
- */ - ASSERT(multilist_is_empty(arc_l2c_only->arcs_list[ARC_BUFC_DATA]) && - multilist_is_empty(arc_l2c_only->arcs_list[ARC_BUFC_METADATA])); -} - -void -arc_space_consume(uint64_t space, arc_space_type_t type) -{ - ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); - - switch (type) { - case ARC_SPACE_DATA: - aggsum_add(&astat_data_size, space); - break; - case ARC_SPACE_META: - aggsum_add(&astat_metadata_size, space); - break; - case ARC_SPACE_BONUS: - aggsum_add(&astat_bonus_size, space); - break; - case ARC_SPACE_DNODE: - aggsum_add(&astat_dnode_size, space); - break; - case ARC_SPACE_DBUF: - aggsum_add(&astat_dbuf_size, space); - break; - case ARC_SPACE_HDRS: - aggsum_add(&astat_hdr_size, space); - break; - case ARC_SPACE_L2HDRS: - aggsum_add(&astat_l2_hdr_size, space); - break; - } - - if (type != ARC_SPACE_DATA) - aggsum_add(&arc_meta_used, space); - - aggsum_add(&arc_size, space); -} - -void -arc_space_return(uint64_t space, arc_space_type_t type) -{ - ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); - - switch (type) { - case ARC_SPACE_DATA: - aggsum_add(&astat_data_size, -space); - break; - case ARC_SPACE_META: - aggsum_add(&astat_metadata_size, -space); - break; - case ARC_SPACE_BONUS: - aggsum_add(&astat_bonus_size, -space); - break; - case ARC_SPACE_DNODE: - aggsum_add(&astat_dnode_size, -space); - break; - case ARC_SPACE_DBUF: - aggsum_add(&astat_dbuf_size, -space); - break; - case ARC_SPACE_HDRS: - aggsum_add(&astat_hdr_size, -space); - break; - case ARC_SPACE_L2HDRS: - aggsum_add(&astat_l2_hdr_size, -space); - break; - } - - if (type != ARC_SPACE_DATA) { - ASSERT(aggsum_compare(&arc_meta_used, space) >= 0); - /* - * We use the upper bound here rather than the precise value - * because the arc_meta_max value doesn't need to be - * precise. It's only consumed by humans via arcstats. - */ - if (arc_meta_max < aggsum_upper_bound(&arc_meta_used)) - arc_meta_max = aggsum_upper_bound(&arc_meta_used); - aggsum_add(&arc_meta_used, -space); - } - - ASSERT(aggsum_compare(&arc_size, space) >= 0); - aggsum_add(&arc_size, -space); -} - -/* - * Given a hdr and a buf, returns whether that buf can share its b_data buffer - * with the hdr's b_pabd. - */ -static boolean_t -arc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf) -{ - /* - * The criteria for sharing a hdr's data are: - * 1. the hdr's compression matches the buf's compression - * 2. the hdr doesn't need to be byteswapped - * 3. the hdr isn't already being shared - * 4. the buf is either compressed or it is the last buf in the hdr list - * - * Criterion #4 maintains the invariant that shared uncompressed - * bufs must be the final buf in the hdr's b_buf list. Reading this, you - * might ask, "if a compressed buf is allocated first, won't that be the - * last thing in the list?", but in that case it's impossible to create - * a shared uncompressed buf anyway (because the hdr must be compressed - * to have the compressed buf). You might also think that #3 is - * sufficient to make this guarantee, however it's possible - * (specifically in the rare L2ARC write race mentioned in - * arc_buf_alloc_impl()) there will be an existing uncompressed buf that - * is sharable, but wasn't at the time of its allocation. Rather than - * allow a new shared uncompressed buf to be created and then shuffle - * the list around to make it the last element, this simply disallows - * sharing if the new buf isn't the first to be added. 
- */ - ASSERT3P(buf->b_hdr, ==, hdr); - boolean_t hdr_compressed = HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF; - boolean_t buf_compressed = ARC_BUF_COMPRESSED(buf) != 0; - return (buf_compressed == hdr_compressed && - hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS && - !HDR_SHARED_DATA(hdr) && - (ARC_BUF_LAST(buf) || ARC_BUF_COMPRESSED(buf))); -} - -/* - * Allocate a buf for this hdr. If you care about the data that's in the hdr, - * or if you want a compressed buffer, pass those flags in. Returns 0 if the - * copy was made successfully, or an error code otherwise. - */ -static int -arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag, boolean_t compressed, - boolean_t fill, arc_buf_t **ret) -{ - arc_buf_t *buf; - - ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); - VERIFY(hdr->b_type == ARC_BUFC_DATA || - hdr->b_type == ARC_BUFC_METADATA); - ASSERT3P(ret, !=, NULL); - ASSERT3P(*ret, ==, NULL); - - buf = *ret = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); - buf->b_hdr = hdr; - buf->b_data = NULL; - buf->b_next = hdr->b_l1hdr.b_buf; - buf->b_flags = 0; - - add_reference(hdr, tag); - - /* - * We're about to change the hdr's b_flags. We must either - * hold the hash_lock or be undiscoverable. - */ - ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); - - /* - * Only honor requests for compressed bufs if the hdr is actually - * compressed. - */ - if (compressed && HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) - buf->b_flags |= ARC_BUF_FLAG_COMPRESSED; - - /* - * If the hdr's data can be shared then we share the data buffer and - * set the appropriate bit in the hdr's b_flags to indicate the hdr is - * sharing it's b_pabd with the arc_buf_t. Otherwise, we allocate a new - * buffer to store the buf's data. - * - * There are two additional restrictions here because we're sharing - * hdr -> buf instead of the usual buf -> hdr. First, the hdr can't be - * actively involved in an L2ARC write, because if this buf is used by - * an arc_write() then the hdr's data buffer will be released when the - * write completes, even though the L2ARC write might still be using it. - * Second, the hdr's ABD must be linear so that the buf's user doesn't - * need to be ABD-aware. - */ - boolean_t can_share = arc_can_share(hdr, buf) && !HDR_L2_WRITING(hdr) && - abd_is_linear(hdr->b_l1hdr.b_pabd); - - /* Set up b_data and sharing */ - if (can_share) { - buf->b_data = abd_to_buf(hdr->b_l1hdr.b_pabd); - buf->b_flags |= ARC_BUF_FLAG_SHARED; - arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); - } else { - buf->b_data = - arc_get_data_buf(hdr, arc_buf_size(buf), buf); - ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf)); - } - VERIFY3P(buf->b_data, !=, NULL); - - hdr->b_l1hdr.b_buf = buf; - hdr->b_l1hdr.b_bufcnt += 1; - - /* - * If the user wants the data from the hdr, we need to either copy or - * decompress the data. - */ - if (fill) { - return (arc_buf_fill(buf, ARC_BUF_COMPRESSED(buf) != 0)); - } - - return (0); -} - -static char *arc_onloan_tag = "onloan"; - -static inline void -arc_loaned_bytes_update(int64_t delta) -{ - atomic_add_64(&arc_loaned_bytes, delta); - - /* assert that it did not wrap around */ - ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0); -} - -/* - * Loan out an anonymous arc buffer. Loaned buffers are not counted as in - * flight data by arc_tempreserve_space() until they are "returned". Loaned - * buffers must be returned to the arc before they can be used by the DMU or - * freed. 
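[Editor's sketch, not part of the removed file.] The loan accounting just described is an atomic counter of bytes out on loan: incremented on loan, decremented on return, and asserted never to go negative. A minimal model using C11 atomics (names invented):

#include <assert.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic int64_t loaned_bytes;

static void
loaned_bytes_update(int64_t delta)
{
	int64_t now = atomic_fetch_add(&loaned_bytes, delta) + delta;

	assert(now >= 0);	/* every loan must be returned before underflow */
}

static void
loan_buf(int64_t size)
{
	loaned_bytes_update(size);
}

static void
return_buf(int64_t size)
{
	loaned_bytes_update(-size);
}

int
main(void)
{
	loan_buf(16384);
	loan_buf(4096);
	return_buf(16384);
	return_buf(4096);
	printf("outstanding: %lld bytes\n", (long long)atomic_load(&loaned_bytes));
	return (0);
}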
- */ -arc_buf_t * -arc_loan_buf(spa_t *spa, boolean_t is_metadata, int size) -{ - arc_buf_t *buf = arc_alloc_buf(spa, arc_onloan_tag, - is_metadata ? ARC_BUFC_METADATA : ARC_BUFC_DATA, size); - - arc_loaned_bytes_update(arc_buf_size(buf)); - - return (buf); -} - -arc_buf_t * -arc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize, - enum zio_compress compression_type) -{ - arc_buf_t *buf = arc_alloc_compressed_buf(spa, arc_onloan_tag, - psize, lsize, compression_type); - - arc_loaned_bytes_update(arc_buf_size(buf)); - - return (buf); -} - - -/* - * Return a loaned arc buffer to the arc. - */ -void -arc_return_buf(arc_buf_t *buf, void *tag) -{ - arc_buf_hdr_t *hdr = buf->b_hdr; - - ASSERT3P(buf->b_data, !=, NULL); - ASSERT(HDR_HAS_L1HDR(hdr)); - (void) zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, tag); - (void) zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); - - arc_loaned_bytes_update(-arc_buf_size(buf)); -} - -/* Detach an arc_buf from a dbuf (tag) */ -void -arc_loan_inuse_buf(arc_buf_t *buf, void *tag) -{ - arc_buf_hdr_t *hdr = buf->b_hdr; - - ASSERT3P(buf->b_data, !=, NULL); - ASSERT(HDR_HAS_L1HDR(hdr)); - (void) zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); - (void) zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag); - - arc_loaned_bytes_update(arc_buf_size(buf)); -} - -static void -l2arc_free_abd_on_write(abd_t *abd, size_t size, arc_buf_contents_t type) -{ - l2arc_data_free_t *df = kmem_alloc(sizeof (*df), KM_SLEEP); - - df->l2df_abd = abd; - df->l2df_size = size; - df->l2df_type = type; - mutex_enter(&l2arc_free_on_write_mtx); - list_insert_head(l2arc_free_on_write, df); - mutex_exit(&l2arc_free_on_write_mtx); -} - -static void -arc_hdr_free_on_write(arc_buf_hdr_t *hdr) -{ - arc_state_t *state = hdr->b_l1hdr.b_state; - arc_buf_contents_t type = arc_buf_type(hdr); - uint64_t size = arc_hdr_size(hdr); - - /* protected by hash lock, if in the hash table */ - if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { - ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); - ASSERT(state != arc_anon && state != arc_l2c_only); - - (void) zfs_refcount_remove_many(&state->arcs_esize[type], - size, hdr); - } - (void) zfs_refcount_remove_many(&state->arcs_size, size, hdr); - if (type == ARC_BUFC_METADATA) { - arc_space_return(size, ARC_SPACE_META); - } else { - ASSERT(type == ARC_BUFC_DATA); - arc_space_return(size, ARC_SPACE_DATA); - } - - l2arc_free_abd_on_write(hdr->b_l1hdr.b_pabd, size, type); -} - -/* - * Share the arc_buf_t's data with the hdr. Whenever we are sharing the - * data buffer, we transfer the refcount ownership to the hdr and update - * the appropriate kstats. - */ -static void -arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) -{ - arc_state_t *state = hdr->b_l1hdr.b_state; - - ASSERT(arc_can_share(hdr, buf)); - ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); - ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); - - /* - * Start sharing the data buffer. We transfer the - * refcount ownership to the hdr since it always owns - * the refcount whenever an arc_buf_t is shared. - */ - zfs_refcount_transfer_ownership(&state->arcs_size, buf, hdr); - hdr->b_l1hdr.b_pabd = abd_get_from_buf(buf->b_data, arc_buf_size(buf)); - abd_take_ownership_of_buf(hdr->b_l1hdr.b_pabd, - HDR_ISTYPE_METADATA(hdr)); - arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); - buf->b_flags |= ARC_BUF_FLAG_SHARED; - - /* - * Since we've transferred ownership to the hdr we need - * to increment its compressed and uncompressed kstats and - * decrement the overhead size. 
- */ - ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr)); - ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr)); - ARCSTAT_INCR(arcstat_overhead_size, -arc_buf_size(buf)); -} - -static void -arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) -{ - arc_state_t *state = hdr->b_l1hdr.b_state; - - ASSERT(arc_buf_is_shared(buf)); - ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); - ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); - - /* - * We are no longer sharing this buffer so we need - * to transfer its ownership to the rightful owner. - */ - zfs_refcount_transfer_ownership(&state->arcs_size, hdr, buf); - arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); - abd_release_ownership_of_buf(hdr->b_l1hdr.b_pabd); - abd_put(hdr->b_l1hdr.b_pabd); - hdr->b_l1hdr.b_pabd = NULL; - buf->b_flags &= ~ARC_BUF_FLAG_SHARED; - - /* - * Since the buffer is no longer shared between - * the arc buf and the hdr, count it as overhead. - */ - ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr)); - ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr)); - ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf)); -} - -/* - * Remove an arc_buf_t from the hdr's buf list and return the last - * arc_buf_t on the list. If no buffers remain on the list then return - * NULL. - */ -static arc_buf_t * -arc_buf_remove(arc_buf_hdr_t *hdr, arc_buf_t *buf) -{ - ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); - - arc_buf_t **bufp = &hdr->b_l1hdr.b_buf; - arc_buf_t *lastbuf = NULL; - - /* - * Remove the buf from the hdr list and locate the last - * remaining buffer on the list. - */ - while (*bufp != NULL) { - if (*bufp == buf) - *bufp = buf->b_next; - - /* - * If we've removed a buffer in the middle of - * the list then update the lastbuf and update - * bufp. - */ - if (*bufp != NULL) { - lastbuf = *bufp; - bufp = &(*bufp)->b_next; - } - } - buf->b_next = NULL; - ASSERT3P(lastbuf, !=, buf); - IMPLY(hdr->b_l1hdr.b_bufcnt > 0, lastbuf != NULL); - IMPLY(hdr->b_l1hdr.b_bufcnt > 0, hdr->b_l1hdr.b_buf != NULL); - IMPLY(lastbuf != NULL, ARC_BUF_LAST(lastbuf)); - - return (lastbuf); -} - -/* - * Free up buf->b_data and pull the arc_buf_t off of the the arc_buf_hdr_t's - * list and free it. - */ -static void -arc_buf_destroy_impl(arc_buf_t *buf) -{ - arc_buf_hdr_t *hdr = buf->b_hdr; - - /* - * Free up the data associated with the buf but only if we're not - * sharing this with the hdr. If we are sharing it with the hdr, the - * hdr is responsible for doing the free. - */ - if (buf->b_data != NULL) { - /* - * We're about to change the hdr's b_flags. We must either - * hold the hash_lock or be undiscoverable. - */ - ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); - - arc_cksum_verify(buf); -#ifdef illumos - arc_buf_unwatch(buf); -#endif - - if (arc_buf_is_shared(buf)) { - arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); - } else { - uint64_t size = arc_buf_size(buf); - arc_free_data_buf(hdr, buf->b_data, size, buf); - ARCSTAT_INCR(arcstat_overhead_size, -size); - } - buf->b_data = NULL; - - ASSERT(hdr->b_l1hdr.b_bufcnt > 0); - hdr->b_l1hdr.b_bufcnt -= 1; - } - - arc_buf_t *lastbuf = arc_buf_remove(hdr, buf); - - if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) { - /* - * If the current arc_buf_t is sharing its data buffer with the - * hdr, then reassign the hdr's b_pabd to share it with the new - * buffer at the end of the list. The shared buffer is always - * the last one on the hdr's buffer list. 
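[Editor's sketch, not part of the removed file.] The arc_buf_remove() shown above is a small pointer-chasing exercise: unlink one element from a singly linked list while remembering the last element that survives, which the caller may then promote to the shared buffer. A standalone version with toy types:

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

struct node {
	struct node	*next;
	int		id;
};

/* Unlink 'victim' from *headp and return the last remaining node (or NULL). */
static struct node *
list_remove(struct node **headp, struct node *victim)
{
	struct node **npp = headp;
	struct node *last = NULL;

	while (*npp != NULL) {
		if (*npp == victim)
			*npp = victim->next;
		if (*npp != NULL) {
			last = *npp;
			npp = &(*npp)->next;
		}
	}
	victim->next = NULL;
	return (last);
}

int
main(void)
{
	struct node c = { NULL, 3 }, b = { &c, 2 }, a = { &b, 1 };
	struct node *head = &a;

	struct node *last = list_remove(&head, &b);
	assert(last == &c && head == &a && a.next == &c);
	printf("last remaining: %d\n", last->id);
	return (0);
}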
- * - * There is an equivalent case for compressed bufs, but since - * they aren't guaranteed to be the last buf in the list and - * that is an exceedingly rare case, we just allow that space be - * wasted temporarily. - */ - if (lastbuf != NULL) { - /* Only one buf can be shared at once */ - VERIFY(!arc_buf_is_shared(lastbuf)); - /* hdr is uncompressed so can't have compressed buf */ - VERIFY(!ARC_BUF_COMPRESSED(lastbuf)); - - ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); - arc_hdr_free_pabd(hdr); - - /* - * We must setup a new shared block between the - * last buffer and the hdr. The data would have - * been allocated by the arc buf so we need to transfer - * ownership to the hdr since it's now being shared. - */ - arc_share_buf(hdr, lastbuf); - } - } else if (HDR_SHARED_DATA(hdr)) { - /* - * Uncompressed shared buffers are always at the end - * of the list. Compressed buffers don't have the - * same requirements. This makes it hard to - * simply assert that the lastbuf is shared so - * we rely on the hdr's compression flags to determine - * if we have a compressed, shared buffer. - */ - ASSERT3P(lastbuf, !=, NULL); - ASSERT(arc_buf_is_shared(lastbuf) || - HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF); - } - - /* - * Free the checksum if we're removing the last uncompressed buf from - * this hdr. - */ - if (!arc_hdr_has_uncompressed_buf(hdr)) { - arc_cksum_free(hdr); - } - - /* clean up the buf */ - buf->b_hdr = NULL; - kmem_cache_free(buf_cache, buf); -} - -static void -arc_hdr_alloc_pabd(arc_buf_hdr_t *hdr, boolean_t do_adapt) -{ - ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); - ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT(!HDR_SHARED_DATA(hdr)); - - ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); - hdr->b_l1hdr.b_pabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, do_adapt); - hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; - ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); - - ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr)); - ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr)); -} - -static void -arc_hdr_free_pabd(arc_buf_hdr_t *hdr) -{ - ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); - - /* - * If the hdr is currently being written to the l2arc then - * we defer freeing the data by adding it to the l2arc_free_on_write - * list. The l2arc will free the data once it's finished - * writing it to the l2arc device. 
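[Editor's sketch, not part of the removed file.] The free-on-write pattern described above can be shown in a few lines: data still referenced by an in-flight write is queued on a pending list instead of being freed immediately, and the list is drained once the write completes. Types and names below are invented; the real code uses l2arc_data_free_t and the l2arc_free_on_write list.

#include <stdio.h>
#include <stdlib.h>

struct deferred_free {
	struct deferred_free	*next;
	void			*buf;
	size_t			size;
};

static struct deferred_free *free_on_write;	/* frees pending on write completion */

static void
free_or_defer(void *buf, size_t size, int write_in_progress)
{
	if (write_in_progress) {
		struct deferred_free *df = malloc(sizeof (*df));

		if (df == NULL)
			abort();
		df->next = free_on_write;
		df->buf = buf;
		df->size = size;
		free_on_write = df;	/* freed later, once the writer is done */
	} else {
		free(buf);
	}
}

/* Called once the in-flight write has finished with the data. */
static void
drain_free_on_write(void)
{
	while (free_on_write != NULL) {
		struct deferred_free *df = free_on_write;

		free_on_write = df->next;
		printf("late free of %zu bytes\n", df->size);
		free(df->buf);
		free(df);
	}
}

int
main(void)
{
	free_or_defer(malloc(4096), 4096, 1);	/* deferred: writer still using it */
	free_or_defer(malloc(512), 512, 0);	/* freed immediately */
	drain_free_on_write();
	return (0);
}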
- */ - if (HDR_L2_WRITING(hdr)) { - arc_hdr_free_on_write(hdr); - ARCSTAT_BUMP(arcstat_l2_free_on_write); - } else { - arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, - arc_hdr_size(hdr), hdr); - } - hdr->b_l1hdr.b_pabd = NULL; - hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; - - ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr)); - ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr)); -} - -static arc_buf_hdr_t * -arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize, - enum zio_compress compression_type, arc_buf_contents_t type) -{ - arc_buf_hdr_t *hdr; - - VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA); - - hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); - ASSERT(HDR_EMPTY(hdr)); - ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); - ASSERT3P(hdr->b_l1hdr.b_thawed, ==, NULL); - HDR_SET_PSIZE(hdr, psize); - HDR_SET_LSIZE(hdr, lsize); - hdr->b_spa = spa; - hdr->b_type = type; - hdr->b_flags = 0; - arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L1HDR); - arc_hdr_set_compress(hdr, compression_type); - - hdr->b_l1hdr.b_state = arc_anon; - hdr->b_l1hdr.b_arc_access = 0; - hdr->b_l1hdr.b_bufcnt = 0; - hdr->b_l1hdr.b_buf = NULL; - - /* - * Allocate the hdr's buffer. This will contain either - * the compressed or uncompressed data depending on the block - * it references and compressed arc enablement. - */ - arc_hdr_alloc_pabd(hdr, B_TRUE); - ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); - - return (hdr); -} - -/* - * Transition between the two allocation states for the arc_buf_hdr struct. - * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without - * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller - * version is used when a cache buffer is only in the L2ARC in order to reduce - * memory usage. - */ -static arc_buf_hdr_t * -arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) -{ - ASSERT(HDR_HAS_L2HDR(hdr)); - - arc_buf_hdr_t *nhdr; - l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; - - ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) || - (old == hdr_l2only_cache && new == hdr_full_cache)); - - nhdr = kmem_cache_alloc(new, KM_PUSHPAGE); - - ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); - buf_hash_remove(hdr); - - bcopy(hdr, nhdr, HDR_L2ONLY_SIZE); - - if (new == hdr_full_cache) { - arc_hdr_set_flags(nhdr, ARC_FLAG_HAS_L1HDR); - /* - * arc_access and arc_change_state need to be aware that a - * header has just come out of L2ARC, so we set its state to - * l2c_only even though it's about to change. - */ - nhdr->b_l1hdr.b_state = arc_l2c_only; - - /* Verify previous threads set to NULL before freeing */ - ASSERT3P(nhdr->b_l1hdr.b_pabd, ==, NULL); - } else { - ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); - ASSERT0(hdr->b_l1hdr.b_bufcnt); - ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); - - /* - * If we've reached here, We must have been called from - * arc_evict_hdr(), as such we should have already been - * removed from any ghost list we were previously on - * (which protects us from racing with arc_evict_state), - * thus no locking is needed during this check. - */ - ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); - - /* - * A buffer must not be moved into the arc_l2c_only - * state if it's not finished being written out to the - * l2arc device. Otherwise, the b_l1hdr.b_pabd field - * might try to be accessed, even though it was removed. 
- */ - VERIFY(!HDR_L2_WRITING(hdr)); - VERIFY3P(hdr->b_l1hdr.b_pabd, ==, NULL); - -#ifdef ZFS_DEBUG - if (hdr->b_l1hdr.b_thawed != NULL) { - kmem_free(hdr->b_l1hdr.b_thawed, 1); - hdr->b_l1hdr.b_thawed = NULL; - } -#endif - - arc_hdr_clear_flags(nhdr, ARC_FLAG_HAS_L1HDR); - } - /* - * The header has been reallocated so we need to re-insert it into any - * lists it was on. - */ - (void) buf_hash_insert(nhdr, NULL); - - ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node)); - - mutex_enter(&dev->l2ad_mtx); - - /* - * We must place the realloc'ed header back into the list at - * the same spot. Otherwise, if it's placed earlier in the list, - * l2arc_write_buffers() could find it during the function's - * write phase, and try to write it out to the l2arc. - */ - list_insert_after(&dev->l2ad_buflist, hdr, nhdr); - list_remove(&dev->l2ad_buflist, hdr); - - mutex_exit(&dev->l2ad_mtx); - - /* - * Since we're using the pointer address as the tag when - * incrementing and decrementing the l2ad_alloc refcount, we - * must remove the old pointer (that we're about to destroy) and - * add the new pointer to the refcount. Otherwise we'd remove - * the wrong pointer address when calling arc_hdr_destroy() later. - */ - - (void) zfs_refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr), - hdr); - (void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(nhdr), - nhdr); - - buf_discard_identity(hdr); - kmem_cache_free(old, hdr); - - return (nhdr); -} - -/* - * Allocate a new arc_buf_hdr_t and arc_buf_t and return the buf to the caller. - * The buf is returned thawed since we expect the consumer to modify it. - */ -arc_buf_t * -arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, int32_t size) -{ - arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size, - ZIO_COMPRESS_OFF, type); - ASSERT(!MUTEX_HELD(HDR_LOCK(hdr))); - - arc_buf_t *buf = NULL; - VERIFY0(arc_buf_alloc_impl(hdr, tag, B_FALSE, B_FALSE, &buf)); - arc_buf_thaw(buf); - - return (buf); -} - -/* - * Allocate a compressed buf in the same manner as arc_alloc_buf. Don't use this - * for bufs containing metadata. - */ -arc_buf_t * -arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize, - enum zio_compress compression_type) -{ - ASSERT3U(lsize, >, 0); - ASSERT3U(lsize, >=, psize); - ASSERT(compression_type > ZIO_COMPRESS_OFF); - ASSERT(compression_type < ZIO_COMPRESS_FUNCTIONS); - - arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, - compression_type, ARC_BUFC_DATA); - ASSERT(!MUTEX_HELD(HDR_LOCK(hdr))); - - arc_buf_t *buf = NULL; - VERIFY0(arc_buf_alloc_impl(hdr, tag, B_TRUE, B_FALSE, &buf)); - arc_buf_thaw(buf); - ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); - - if (!arc_buf_is_shared(buf)) { - /* - * To ensure that the hdr has the correct data in it if we call - * arc_decompress() on this buf before it's been written to - * disk, it's easiest if we just set up sharing between the - * buf and the hdr. 
- */ - ASSERT(!abd_is_linear(hdr->b_l1hdr.b_pabd)); - arc_hdr_free_pabd(hdr); - arc_share_buf(hdr, buf); - } - - return (buf); -} - -static void -arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr) -{ - l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; - l2arc_dev_t *dev = l2hdr->b_dev; - uint64_t psize = arc_hdr_size(hdr); - - ASSERT(MUTEX_HELD(&dev->l2ad_mtx)); - ASSERT(HDR_HAS_L2HDR(hdr)); - - list_remove(&dev->l2ad_buflist, hdr); - - ARCSTAT_INCR(arcstat_l2_psize, -psize); - ARCSTAT_INCR(arcstat_l2_lsize, -HDR_GET_LSIZE(hdr)); - - vdev_space_update(dev->l2ad_vdev, -psize, 0, 0); - - (void) zfs_refcount_remove_many(&dev->l2ad_alloc, psize, hdr); - arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR); -} - -static void -arc_hdr_destroy(arc_buf_hdr_t *hdr) -{ - if (HDR_HAS_L1HDR(hdr)) { - ASSERT(hdr->b_l1hdr.b_buf == NULL || - hdr->b_l1hdr.b_bufcnt > 0); - ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); - ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); - } - ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - ASSERT(!HDR_IN_HASH_TABLE(hdr)); - - if (!HDR_EMPTY(hdr)) - buf_discard_identity(hdr); - - if (HDR_HAS_L2HDR(hdr)) { - l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; - boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx); - - if (!buflist_held) - mutex_enter(&dev->l2ad_mtx); - - /* - * Even though we checked this conditional above, we - * need to check this again now that we have the - * l2ad_mtx. This is because we could be racing with - * another thread calling l2arc_evict() which might have - * destroyed this header's L2 portion as we were waiting - * to acquire the l2ad_mtx. If that happens, we don't - * want to re-destroy the header's L2 portion. - */ - if (HDR_HAS_L2HDR(hdr)) { - l2arc_trim(hdr); - arc_hdr_l2hdr_destroy(hdr); - } - - if (!buflist_held) - mutex_exit(&dev->l2ad_mtx); - } - - if (HDR_HAS_L1HDR(hdr)) { - arc_cksum_free(hdr); - - while (hdr->b_l1hdr.b_buf != NULL) - arc_buf_destroy_impl(hdr->b_l1hdr.b_buf); - -#ifdef ZFS_DEBUG - if (hdr->b_l1hdr.b_thawed != NULL) { - kmem_free(hdr->b_l1hdr.b_thawed, 1); - hdr->b_l1hdr.b_thawed = NULL; - } -#endif - - if (hdr->b_l1hdr.b_pabd != NULL) { - arc_hdr_free_pabd(hdr); - } - } - - ASSERT3P(hdr->b_hash_next, ==, NULL); - if (HDR_HAS_L1HDR(hdr)) { - ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); - ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); - kmem_cache_free(hdr_full_cache, hdr); - } else { - kmem_cache_free(hdr_l2only_cache, hdr); - } -} - -void -arc_buf_destroy(arc_buf_t *buf, void* tag) -{ - arc_buf_hdr_t *hdr = buf->b_hdr; - kmutex_t *hash_lock = HDR_LOCK(hdr); - - if (hdr->b_l1hdr.b_state == arc_anon) { - ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); - ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - VERIFY0(remove_reference(hdr, NULL, tag)); - arc_hdr_destroy(hdr); - return; - } - - mutex_enter(hash_lock); - ASSERT3P(hdr, ==, buf->b_hdr); - ASSERT(hdr->b_l1hdr.b_bufcnt > 0); - ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); - ASSERT3P(hdr->b_l1hdr.b_state, !=, arc_anon); - ASSERT3P(buf->b_data, !=, NULL); - - (void) remove_reference(hdr, hash_lock, tag); - arc_buf_destroy_impl(buf); - mutex_exit(hash_lock); -} - -/* - * Evict the arc_buf_hdr that is provided as a parameter. The resultant - * state of the header is dependent on its state prior to entering this - * function. 
The following transitions are possible: - * - * - arc_mru -> arc_mru_ghost - * - arc_mfu -> arc_mfu_ghost - * - arc_mru_ghost -> arc_l2c_only - * - arc_mru_ghost -> deleted - * - arc_mfu_ghost -> arc_l2c_only - * - arc_mfu_ghost -> deleted - */ -static int64_t -arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) -{ - arc_state_t *evicted_state, *state; - int64_t bytes_evicted = 0; - int min_lifetime = HDR_PRESCIENT_PREFETCH(hdr) ? - zfs_arc_min_prescient_prefetch_ms : zfs_arc_min_prefetch_ms; - - ASSERT(MUTEX_HELD(hash_lock)); - ASSERT(HDR_HAS_L1HDR(hdr)); - - state = hdr->b_l1hdr.b_state; - if (GHOST_STATE(state)) { - ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); - - /* - * l2arc_write_buffers() relies on a header's L1 portion - * (i.e. its b_pabd field) during it's write phase. - * Thus, we cannot push a header onto the arc_l2c_only - * state (removing it's L1 piece) until the header is - * done being written to the l2arc. - */ - if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) { - ARCSTAT_BUMP(arcstat_evict_l2_skip); - return (bytes_evicted); - } - - ARCSTAT_BUMP(arcstat_deleted); - bytes_evicted += HDR_GET_LSIZE(hdr); - - DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr); - - ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); - if (HDR_HAS_L2HDR(hdr)) { - /* - * This buffer is cached on the 2nd Level ARC; - * don't destroy the header. - */ - arc_change_state(arc_l2c_only, hdr, hash_lock); - /* - * dropping from L1+L2 cached to L2-only, - * realloc to remove the L1 header. - */ - hdr = arc_hdr_realloc(hdr, hdr_full_cache, - hdr_l2only_cache); - } else { - arc_change_state(arc_anon, hdr, hash_lock); - arc_hdr_destroy(hdr); - } - return (bytes_evicted); - } - - ASSERT(state == arc_mru || state == arc_mfu); - evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; - - /* prefetch buffers have a minimum lifespan */ - if (HDR_IO_IN_PROGRESS(hdr) || - ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) && - ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < min_lifetime * hz)) { - ARCSTAT_BUMP(arcstat_evict_skip); - return (bytes_evicted); - } - - ASSERT0(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt)); - while (hdr->b_l1hdr.b_buf) { - arc_buf_t *buf = hdr->b_l1hdr.b_buf; - if (!mutex_tryenter(&buf->b_evict_lock)) { - ARCSTAT_BUMP(arcstat_mutex_miss); - break; - } - if (buf->b_data != NULL) - bytes_evicted += HDR_GET_LSIZE(hdr); - mutex_exit(&buf->b_evict_lock); - arc_buf_destroy_impl(buf); - } - - if (HDR_HAS_L2HDR(hdr)) { - ARCSTAT_INCR(arcstat_evict_l2_cached, HDR_GET_LSIZE(hdr)); - } else { - if (l2arc_write_eligible(hdr->b_spa, hdr)) { - ARCSTAT_INCR(arcstat_evict_l2_eligible, - HDR_GET_LSIZE(hdr)); - } else { - ARCSTAT_INCR(arcstat_evict_l2_ineligible, - HDR_GET_LSIZE(hdr)); - } - } - - if (hdr->b_l1hdr.b_bufcnt == 0) { - arc_cksum_free(hdr); - - bytes_evicted += arc_hdr_size(hdr); - - /* - * If this hdr is being evicted and has a compressed - * buffer then we discard it here before we change states. - * This ensures that the accounting is updated correctly - * in arc_free_data_impl(). 
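[Editor's sketch, not part of the removed file.] The transition list at the top of arc_evict_hdr()'s comment reads as a tiny state machine: MRU/MFU headers move to their ghost state, and a ghost header either drops to L2-only or is deleted, depending on whether it still has an L2 copy. The enum below encodes just those transitions with hypothetical names, not the original arc_state_t machinery.

#include <stdio.h>

enum toy_state {
	TOY_MRU, TOY_MFU, TOY_MRU_GHOST, TOY_MFU_GHOST,
	TOY_L2C_ONLY, TOY_DELETED
};

static const char *toy_name[] = {
	"mru", "mfu", "mru_ghost", "mfu_ghost", "l2c_only", "deleted"
};

/* Where does eviction send a header in 'cur', given whether it has an L2 copy? */
static enum toy_state
toy_evict(enum toy_state cur, int has_l2hdr)
{
	switch (cur) {
	case TOY_MRU:
		return (TOY_MRU_GHOST);
	case TOY_MFU:
		return (TOY_MFU_GHOST);
	case TOY_MRU_GHOST:
	case TOY_MFU_GHOST:
		return (has_l2hdr ? TOY_L2C_ONLY : TOY_DELETED);
	default:
		return (cur);	/* nothing else is evicted this way */
	}
}

int
main(void)
{
	printf("mru        -> %s\n", toy_name[toy_evict(TOY_MRU, 0)]);
	printf("mru_ghost  -> %s\n", toy_name[toy_evict(TOY_MRU_GHOST, 1)]);
	printf("mfu_ghost  -> %s\n", toy_name[toy_evict(TOY_MFU_GHOST, 0)]);
	return (0);
}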
- */ - arc_hdr_free_pabd(hdr); - - arc_change_state(evicted_state, hdr, hash_lock); - ASSERT(HDR_IN_HASH_TABLE(hdr)); - arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE); - DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr); - } - - return (bytes_evicted); -} - -static uint64_t -arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, - uint64_t spa, int64_t bytes) -{ - multilist_sublist_t *mls; - uint64_t bytes_evicted = 0; - arc_buf_hdr_t *hdr; - kmutex_t *hash_lock; - int evict_count = 0; - - ASSERT3P(marker, !=, NULL); - IMPLY(bytes < 0, bytes == ARC_EVICT_ALL); - - mls = multilist_sublist_lock(ml, idx); - - for (hdr = multilist_sublist_prev(mls, marker); hdr != NULL; - hdr = multilist_sublist_prev(mls, marker)) { - if ((bytes != ARC_EVICT_ALL && bytes_evicted >= bytes) || - (evict_count >= zfs_arc_evict_batch_limit)) - break; - - /* - * To keep our iteration location, move the marker - * forward. Since we're not holding hdr's hash lock, we - * must be very careful and not remove 'hdr' from the - * sublist. Otherwise, other consumers might mistake the - * 'hdr' as not being on a sublist when they call the - * multilist_link_active() function (they all rely on - * the hash lock protecting concurrent insertions and - * removals). multilist_sublist_move_forward() was - * specifically implemented to ensure this is the case - * (only 'marker' will be removed and re-inserted). - */ - multilist_sublist_move_forward(mls, marker); - - /* - * The only case where the b_spa field should ever be - * zero, is the marker headers inserted by - * arc_evict_state(). It's possible for multiple threads - * to be calling arc_evict_state() concurrently (e.g. - * dsl_pool_close() and zio_inject_fault()), so we must - * skip any markers we see from these other threads. - */ - if (hdr->b_spa == 0) - continue; - - /* we're only interested in evicting buffers of a certain spa */ - if (spa != 0 && hdr->b_spa != spa) { - ARCSTAT_BUMP(arcstat_evict_skip); - continue; - } - - hash_lock = HDR_LOCK(hdr); - - /* - * We aren't calling this function from any code path - * that would already be holding a hash lock, so we're - * asserting on this assumption to be defensive in case - * this ever changes. Without this check, it would be - * possible to incorrectly increment arcstat_mutex_miss - * below (e.g. if the code changed such that we called - * this function with a hash lock held). - */ - ASSERT(!MUTEX_HELD(hash_lock)); - - if (mutex_tryenter(hash_lock)) { - uint64_t evicted = arc_evict_hdr(hdr, hash_lock); - mutex_exit(hash_lock); - - bytes_evicted += evicted; - - /* - * If evicted is zero, arc_evict_hdr() must have - * decided to skip this header, don't increment - * evict_count in this case. - */ - if (evicted != 0) - evict_count++; - - /* - * If arc_size isn't overflowing, signal any - * threads that might happen to be waiting. - * - * For each header evicted, we wake up a single - * thread. If we used cv_broadcast, we could - * wake up "too many" threads causing arc_size - * to significantly overflow arc_c; since - * arc_get_data_impl() doesn't check for overflow - * when it's woken up (it doesn't because it's - * possible for the ARC to be overflowing while - * full of un-evictable buffers, and the - * function should proceed in this case). - * - * If threads are left sleeping, due to not - * using cv_broadcast here, they will be woken - * up via cv_broadcast in arc_adjust_cb() just - * before arc_adjust_zthr sleeps. 
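[Editor's sketch, not part of the removed file.] The wake-one policy described above (signal a single waiter per evicted header, and rely on a later broadcast to release any stragglers) can be modelled with pthreads. This is a simplified userland analogue, not the kernel condvar code; build with -lpthread.

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

static pthread_mutex_t adjust_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t waiters_cv = PTHREAD_COND_INITIALIZER;
static int units_over_target = 3;	/* pretend the cache is 3 units too big */

static void *
waiter(void *arg)
{
	pthread_mutex_lock(&adjust_lock);
	while (units_over_target > 0)
		pthread_cond_wait(&waiters_cv, &adjust_lock);
	pthread_mutex_unlock(&adjust_lock);
	printf("waiter %d may proceed\n", (int)(intptr_t)arg);
	return (NULL);
}

int
main(void)
{
	pthread_t tid[2];

	for (intptr_t i = 0; i < 2; i++)
		pthread_create(&tid[i], NULL, waiter, (void *)i);

	/* Evict one unit per pass; wake at most one waiter per eviction. */
	for (int pass = 0; pass < 3; pass++) {
		pthread_mutex_lock(&adjust_lock);
		units_over_target--;
		if (units_over_target <= 0)
			pthread_cond_signal(&waiters_cv);
		pthread_mutex_unlock(&adjust_lock);
	}

	/* Anything still asleep is released by a broadcast, as in arc_adjust_cb(). */
	pthread_mutex_lock(&adjust_lock);
	pthread_cond_broadcast(&waiters_cv);
	pthread_mutex_unlock(&adjust_lock);

	for (int i = 0; i < 2; i++)
		pthread_join(tid[i], NULL);
	return (0);
}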
- */ - mutex_enter(&arc_adjust_lock); - if (!arc_is_overflowing()) - cv_signal(&arc_adjust_waiters_cv); - mutex_exit(&arc_adjust_lock); - } else { - ARCSTAT_BUMP(arcstat_mutex_miss); - } - } - - multilist_sublist_unlock(mls); - - return (bytes_evicted); -} - -/* - * Evict buffers from the given arc state, until we've removed the - * specified number of bytes. Move the removed buffers to the - * appropriate evict state. - * - * This function makes a "best effort". It skips over any buffers - * it can't get a hash_lock on, and so, may not catch all candidates. - * It may also return without evicting as much space as requested. - * - * If bytes is specified using the special value ARC_EVICT_ALL, this - * will evict all available (i.e. unlocked and evictable) buffers from - * the given arc state; which is used by arc_flush(). - */ -static uint64_t -arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes, - arc_buf_contents_t type) -{ - uint64_t total_evicted = 0; - multilist_t *ml = state->arcs_list[type]; - int num_sublists; - arc_buf_hdr_t **markers; - - IMPLY(bytes < 0, bytes == ARC_EVICT_ALL); - - num_sublists = multilist_get_num_sublists(ml); - - /* - * If we've tried to evict from each sublist, made some - * progress, but still have not hit the target number of bytes - * to evict, we want to keep trying. The markers allow us to - * pick up where we left off for each individual sublist, rather - * than starting from the tail each time. - */ - markers = kmem_zalloc(sizeof (*markers) * num_sublists, KM_SLEEP); - for (int i = 0; i < num_sublists; i++) { - markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP); - - /* - * A b_spa of 0 is used to indicate that this header is - * a marker. This fact is used in arc_adjust_type() and - * arc_evict_state_impl(). - */ - markers[i]->b_spa = 0; - - multilist_sublist_t *mls = multilist_sublist_lock(ml, i); - multilist_sublist_insert_tail(mls, markers[i]); - multilist_sublist_unlock(mls); - } - - /* - * While we haven't hit our target number of bytes to evict, or - * we're evicting all available buffers. - */ - while (total_evicted < bytes || bytes == ARC_EVICT_ALL) { - int sublist_idx = multilist_get_random_index(ml); - uint64_t scan_evicted = 0; - - /* - * Try to reduce pinned dnodes with a floor of arc_dnode_limit. - * Request that 10% of the LRUs be scanned by the superblock - * shrinker. - */ - if (type == ARC_BUFC_DATA && aggsum_compare(&astat_dnode_size, - arc_dnode_limit) > 0) { - arc_prune_async((aggsum_upper_bound(&astat_dnode_size) - - arc_dnode_limit) / sizeof (dnode_t) / - zfs_arc_dnode_reduce_percent); - } - - /* - * Start eviction using a randomly selected sublist, - * this is to try and evenly balance eviction across all - * sublists. Always starting at the same sublist - * (e.g. index 0) would cause evictions to favor certain - * sublists over others. 
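[Editor's sketch, not part of the removed file.] The balancing trick just described, start at a random sublist and wrap around, is easy to show in isolation. The per-sublist evict function below is a fake stand-in for arc_evict_state_impl(); amounts are invented.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define NUM_SUBLISTS	8

/* Pretend each sublist can give back a fixed amount per visit. */
static uint64_t
evict_from_sublist(int idx, uint64_t wanted)
{
	uint64_t avail = 1024ULL * (idx + 1);

	return (avail < wanted ? avail : wanted);
}

int
main(void)
{
	uint64_t target = 20000, total = 0;
	int idx;

	srand((unsigned)time(NULL));
	idx = rand() % NUM_SUBLISTS;		/* random starting sublist */

	for (int i = 0; i < NUM_SUBLISTS && total < target; i++) {
		uint64_t got = evict_from_sublist(idx, target - total);

		total += got;
		printf("sublist %d evicted %llu\n", idx, (unsigned long long)got);
		if (++idx >= NUM_SUBLISTS)	/* reached the end: wrap around */
			idx = 0;
	}
	printf("total evicted: %llu of %llu\n",
	    (unsigned long long)total, (unsigned long long)target);
	return (0);
}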
-		 */
-		for (int i = 0; i < num_sublists; i++) {
-			uint64_t bytes_remaining;
-			uint64_t bytes_evicted;
-
-			if (bytes == ARC_EVICT_ALL)
-				bytes_remaining = ARC_EVICT_ALL;
-			else if (total_evicted < bytes)
-				bytes_remaining = bytes - total_evicted;
-			else
-				break;
-
-			bytes_evicted = arc_evict_state_impl(ml, sublist_idx,
-			    markers[sublist_idx], spa, bytes_remaining);
-
-			scan_evicted += bytes_evicted;
-			total_evicted += bytes_evicted;
-
-			/* we've reached the end, wrap to the beginning */
-			if (++sublist_idx >= num_sublists)
-				sublist_idx = 0;
-		}
-
-		/*
-		 * If we didn't evict anything during this scan, we have
-		 * no reason to believe we'll evict more during another
-		 * scan, so break the loop.
-		 */
-		if (scan_evicted == 0) {
-			/* This isn't possible, let's make that obvious */
-			ASSERT3S(bytes, !=, 0);
-
-			/*
-			 * When bytes is ARC_EVICT_ALL, the only way to
-			 * break the loop is when scan_evicted is zero.
-			 * In that case, we actually have evicted enough,
-			 * so we don't want to increment the kstat.
-			 */
-			if (bytes != ARC_EVICT_ALL) {
-				ASSERT3S(total_evicted, <, bytes);
-				ARCSTAT_BUMP(arcstat_evict_not_enough);
-			}
-
-			break;
-		}
-	}
-
-	for (int i = 0; i < num_sublists; i++) {
-		multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
-		multilist_sublist_remove(mls, markers[i]);
-		multilist_sublist_unlock(mls);
-
-		kmem_cache_free(hdr_full_cache, markers[i]);
-	}
-	kmem_free(markers, sizeof (*markers) * num_sublists);
-
-	return (total_evicted);
-}
-
-/*
- * Flush all "evictable" data of the given type from the arc state
- * specified. This will not evict any "active" buffers (i.e. referenced).
- *
- * When 'retry' is set to B_FALSE, the function will make a single pass
- * over the state and evict any buffers that it can. Since it doesn't
- * continually retry the eviction, it might end up leaving some buffers
- * in the ARC due to lock misses.
- *
- * When 'retry' is set to B_TRUE, the function will continually retry the
- * eviction until *all* evictable buffers have been removed from the
- * state. As a result, if concurrent insertions into the state are
- * allowed (e.g. if the ARC isn't shutting down), this function might
- * wind up in an infinite loop, continually trying to evict buffers.
- */
-static uint64_t
-arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type,
-    boolean_t retry)
-{
-	uint64_t evicted = 0;
-
-	while (zfs_refcount_count(&state->arcs_esize[type]) != 0) {
-		evicted += arc_evict_state(state, spa, ARC_EVICT_ALL, type);
-
-		if (!retry)
-			break;
-	}
-
-	return (evicted);
-}
-
-/*
- * Helper function for arc_prune_async(); it is responsible for safely
- * handling the execution of a registered arc_prune_func_t.
- */
-static void
-arc_prune_task(void *ptr)
-{
-	arc_prune_t *ap = (arc_prune_t *)ptr;
-	arc_prune_func_t *func = ap->p_pfunc;
-
-	if (func != NULL)
-		func(ap->p_adjust, ap->p_private);
-
-	zfs_refcount_remove(&ap->p_refcnt, func);
-}
-
-/*
- * Notify registered consumers they must drop holds on a portion of the ARC
- * buffers they reference. This provides a mechanism to ensure the ARC can
- * honor the arc_meta_limit and reclaim otherwise pinned ARC buffers. This
- * is analogous to dnlc_reduce_cache() but more generic.
- *
- * This operation is performed asynchronously so it may be safely called
- * in the context of the arc_reclaim_thread(). A reference is taken here
- * for each registered arc_prune_t and the arc_prune_task() is responsible
- * for releasing it once the registered arc_prune_func_t has completed.
- */
-static void
-arc_prune_async(int64_t adjust)
-{
-	arc_prune_t *ap;
-
-	mutex_enter(&arc_prune_mtx);
-	for (ap = list_head(&arc_prune_list); ap != NULL;
-	    ap = list_next(&arc_prune_list, ap)) {
-
-		if (zfs_refcount_count(&ap->p_refcnt) >= 2)
-			continue;
-
-		zfs_refcount_add(&ap->p_refcnt, ap->p_pfunc);
-		ap->p_adjust = adjust;
-		if (taskq_dispatch(arc_prune_taskq, arc_prune_task,
-		    ap, TQ_SLEEP) == TASKQID_INVALID) {
-			zfs_refcount_remove(&ap->p_refcnt, ap->p_pfunc);
-			continue;
-		}
-		ARCSTAT_BUMP(arcstat_prune);
-	}
-	mutex_exit(&arc_prune_mtx);
-}
-
-/*
- * Evict the specified number of bytes from the state specified,
- * restricting eviction to the spa and type given. This function
- * prevents us from trying to evict more from a state's list than
- * is "evictable", and to skip evicting altogether when passed a
- * negative value for "bytes". In contrast, arc_evict_state() will
- * evict everything it can, when passed a negative value for "bytes".
- */
-static uint64_t
-arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes,
-    arc_buf_contents_t type)
-{
-	int64_t delta;
-
-	if (bytes > 0 && zfs_refcount_count(&state->arcs_esize[type]) > 0) {
-		delta = MIN(zfs_refcount_count(&state->arcs_esize[type]),
-		    bytes);
-		return (arc_evict_state(state, spa, delta, type));
-	}
-
-	return (0);
-}
-
-/*
- * The goal of this function is to evict enough meta data buffers from the
- * ARC in order to enforce the arc_meta_limit. Achieving this is slightly
- * more complicated than it appears because it is common for data buffers
- * to have holds on meta data buffers. In addition, dnode meta data buffers
- * will be held by the dnodes in the block preventing them from being freed.
- * This means we can't simply traverse the ARC and expect to always find
- * enough unheld meta data buffers to release.
- *
- * Therefore, this function has been updated to make alternating passes
- * over the ARC releasing data buffers and then newly unheld meta data
- * buffers. This ensures forward progress is maintained and meta_used
- * will decrease. Normally this is sufficient, but if required the ARC
- * will call the registered prune callbacks causing dentry and inodes to
- * be dropped from the VFS cache. This will make dnode meta data buffers
- * available for reclaim.
- */
-static uint64_t
-arc_adjust_meta_balanced(uint64_t meta_used)
-{
-	int64_t delta, prune = 0, adjustmnt;
-	uint64_t total_evicted = 0;
-	arc_buf_contents_t type = ARC_BUFC_DATA;
-	int restarts = MAX(zfs_arc_meta_adjust_restarts, 0);
-
-restart:
-	/*
-	 * This differs slightly from the way we evict from the mru in
-	 * arc_adjust because we don't have a "target" value (i.e. no
-	 * "meta" arc_p). As a result, I think we can completely
-	 * cannibalize the metadata in the MRU before we evict the
-	 * metadata from the MFU. I think we probably need to implement a
-	 * "metadata arc_p" value to do this properly.
-	 */
-	adjustmnt = meta_used - arc_meta_limit;
-
-	if (adjustmnt > 0 &&
-	    zfs_refcount_count(&arc_mru->arcs_esize[type]) > 0) {
-		delta = MIN(zfs_refcount_count(&arc_mru->arcs_esize[type]),
-		    adjustmnt);
-		total_evicted += arc_adjust_impl(arc_mru, 0, delta, type);
-		adjustmnt -= delta;
-	}
-
-	/*
-	 * We can't afford to recalculate adjustmnt here. If we do,
-	 * new metadata buffers can sneak into the MRU or ANON lists,
-	 * thus penalizing the MFU metadata. Although the fudge factor is
-	 * small, it has been empirically shown to be significant for
-	 * certain workloads (e.g. creating many empty directories).
As - * such, we use the original calculation for adjustmnt, and - * simply decrement the amount of data evicted from the MRU. - */ - - if (adjustmnt > 0 && - zfs_refcount_count(&arc_mfu->arcs_esize[type]) > 0) { - delta = MIN(zfs_refcount_count(&arc_mfu->arcs_esize[type]), - adjustmnt); - total_evicted += arc_adjust_impl(arc_mfu, 0, delta, type); - } - - adjustmnt = meta_used - arc_meta_limit; - - if (adjustmnt > 0 && - zfs_refcount_count(&arc_mru_ghost->arcs_esize[type]) > 0) { - delta = MIN(adjustmnt, - zfs_refcount_count(&arc_mru_ghost->arcs_esize[type])); - total_evicted += arc_adjust_impl(arc_mru_ghost, 0, delta, type); - adjustmnt -= delta; - } - - if (adjustmnt > 0 && - zfs_refcount_count(&arc_mfu_ghost->arcs_esize[type]) > 0) { - delta = MIN(adjustmnt, - zfs_refcount_count(&arc_mfu_ghost->arcs_esize[type])); - total_evicted += arc_adjust_impl(arc_mfu_ghost, 0, delta, type); - } - - /* - * If after attempting to make the requested adjustment to the ARC - * the meta limit is still being exceeded then request that the - * higher layers drop some cached objects which have holds on ARC - * meta buffers. Requests to the upper layers will be made with - * increasingly large scan sizes until the ARC is below the limit. - */ - if (meta_used > arc_meta_limit) { - if (type == ARC_BUFC_DATA) { - type = ARC_BUFC_METADATA; - } else { - type = ARC_BUFC_DATA; - - if (zfs_arc_meta_prune) { - prune += zfs_arc_meta_prune; - arc_prune_async(prune); - } - } - - if (restarts > 0) { - restarts--; - goto restart; - } - } - return (total_evicted); -} - -/* - * Evict metadata buffers from the cache, such that arc_meta_used is - * capped by the arc_meta_limit tunable. - */ -static uint64_t -arc_adjust_meta_only(uint64_t meta_used) -{ - uint64_t total_evicted = 0; - int64_t target; - - /* - * If we're over the meta limit, we want to evict enough - * metadata to get back under the meta limit. We don't want to - * evict so much that we drop the MRU below arc_p, though. If - * we're over the meta limit more than we're over arc_p, we - * evict some from the MRU here, and some from the MFU below. - */ - target = MIN((int64_t)(meta_used - arc_meta_limit), - (int64_t)(zfs_refcount_count(&arc_anon->arcs_size) + - zfs_refcount_count(&arc_mru->arcs_size) - arc_p)); - - total_evicted += arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA); - - /* - * Similar to the above, we want to evict enough bytes to get us - * below the meta limit, but not so much as to drop us below the - * space allotted to the MFU (which is defined as arc_c - arc_p). - */ - target = MIN((int64_t)(meta_used - arc_meta_limit), - (int64_t)(zfs_refcount_count(&arc_mfu->arcs_size) - - (arc_c - arc_p))); - - total_evicted += arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); - - return (total_evicted); -} - -static uint64_t -arc_adjust_meta(uint64_t meta_used) -{ - if (zfs_arc_meta_strategy == ARC_STRATEGY_META_ONLY) - return (arc_adjust_meta_only(meta_used)); - else - return (arc_adjust_meta_balanced(meta_used)); -} - -/* - * Return the type of the oldest buffer in the given arc state - * - * This function will select a random sublist of type ARC_BUFC_DATA and - * a random sublist of type ARC_BUFC_METADATA. The tail of each sublist - * is compared, and the type which contains the "older" buffer will be - * returned. 
- */ -static arc_buf_contents_t -arc_adjust_type(arc_state_t *state) -{ - multilist_t *data_ml = state->arcs_list[ARC_BUFC_DATA]; - multilist_t *meta_ml = state->arcs_list[ARC_BUFC_METADATA]; - int data_idx = multilist_get_random_index(data_ml); - int meta_idx = multilist_get_random_index(meta_ml); - multilist_sublist_t *data_mls; - multilist_sublist_t *meta_mls; - arc_buf_contents_t type; - arc_buf_hdr_t *data_hdr; - arc_buf_hdr_t *meta_hdr; - - /* - * We keep the sublist lock until we're finished, to prevent - * the headers from being destroyed via arc_evict_state(). - */ - data_mls = multilist_sublist_lock(data_ml, data_idx); - meta_mls = multilist_sublist_lock(meta_ml, meta_idx); - - /* - * These two loops are to ensure we skip any markers that - * might be at the tail of the lists due to arc_evict_state(). - */ - - for (data_hdr = multilist_sublist_tail(data_mls); data_hdr != NULL; - data_hdr = multilist_sublist_prev(data_mls, data_hdr)) { - if (data_hdr->b_spa != 0) - break; - } - - for (meta_hdr = multilist_sublist_tail(meta_mls); meta_hdr != NULL; - meta_hdr = multilist_sublist_prev(meta_mls, meta_hdr)) { - if (meta_hdr->b_spa != 0) - break; - } - - if (data_hdr == NULL && meta_hdr == NULL) { - type = ARC_BUFC_DATA; - } else if (data_hdr == NULL) { - ASSERT3P(meta_hdr, !=, NULL); - type = ARC_BUFC_METADATA; - } else if (meta_hdr == NULL) { - ASSERT3P(data_hdr, !=, NULL); - type = ARC_BUFC_DATA; - } else { - ASSERT3P(data_hdr, !=, NULL); - ASSERT3P(meta_hdr, !=, NULL); - - /* The headers can't be on the sublist without an L1 header */ - ASSERT(HDR_HAS_L1HDR(data_hdr)); - ASSERT(HDR_HAS_L1HDR(meta_hdr)); - - if (data_hdr->b_l1hdr.b_arc_access < - meta_hdr->b_l1hdr.b_arc_access) { - type = ARC_BUFC_DATA; - } else { - type = ARC_BUFC_METADATA; - } - } - - multilist_sublist_unlock(meta_mls); - multilist_sublist_unlock(data_mls); - - return (type); -} - -/* - * Evict buffers from the cache, such that arc_size is capped by arc_c. - */ -static uint64_t -arc_adjust(void) -{ - uint64_t total_evicted = 0; - uint64_t bytes; - int64_t target; - uint64_t asize = aggsum_value(&arc_size); - uint64_t ameta = aggsum_value(&arc_meta_used); - - /* - * If we're over arc_meta_limit, we want to correct that before - * potentially evicting data buffers below. - */ - total_evicted += arc_adjust_meta(ameta); - - /* - * Adjust MRU size - * - * If we're over the target cache size, we want to evict enough - * from the list to get back to our target size. We don't want - * to evict too much from the MRU, such that it drops below - * arc_p. So, if we're over our target cache size more than - * the MRU is over arc_p, we'll evict enough to get back to - * arc_p here, and then evict more from the MFU below. - */ - target = MIN((int64_t)(asize - arc_c), - (int64_t)(zfs_refcount_count(&arc_anon->arcs_size) + - zfs_refcount_count(&arc_mru->arcs_size) + ameta - arc_p)); - - /* - * If we're below arc_meta_min, always prefer to evict data. - * Otherwise, try to satisfy the requested number of bytes to - * evict from the type which contains older buffers; in an - * effort to keep newer buffers in the cache regardless of their - * type. If we cannot satisfy the number of bytes from this - * type, spill over into the next type. - */ - if (arc_adjust_type(arc_mru) == ARC_BUFC_METADATA && - ameta > arc_meta_min) { - bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA); - total_evicted += bytes; - - /* - * If we couldn't evict our target number of bytes from - * metadata, we try to get the rest from data. 
-		 */
-		target -= bytes;
-
-		total_evicted +=
-		    arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA);
-	} else {
-		bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA);
-		total_evicted += bytes;
-
-		/*
-		 * If we couldn't evict our target number of bytes from
-		 * data, we try to get the rest from metadata.
-		 */
-		target -= bytes;
-
-		total_evicted +=
-		    arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
-	}
-
-	/*
-	 * Re-sum ARC stats after the first round of evictions.
-	 */
-	asize = aggsum_value(&arc_size);
-	ameta = aggsum_value(&arc_meta_used);
-
-	/*
-	 * Adjust MFU size
-	 *
-	 * Now that we've tried to evict enough from the MRU to get its
-	 * size back to arc_p, if we're still above the target cache
-	 * size, we evict the rest from the MFU.
-	 */
-	target = asize - arc_c;
-
-	if (arc_adjust_type(arc_mfu) == ARC_BUFC_METADATA &&
-	    ameta > arc_meta_min) {
-		bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
-		total_evicted += bytes;
-
-		/*
-		 * If we couldn't evict our target number of bytes from
-		 * metadata, we try to get the rest from data.
-		 */
-		target -= bytes;
-
-		total_evicted +=
-		    arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
-	} else {
-		bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
-		total_evicted += bytes;
-
-		/*
-		 * If we couldn't evict our target number of bytes from
-		 * data, we try to get the rest from metadata.
-		 */
-		target -= bytes;
-
-		total_evicted +=
-		    arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
-	}
-
-	/*
-	 * Adjust ghost lists
-	 *
-	 * In addition to the above, the ARC also defines target values
-	 * for the ghost lists. The sum of the mru list and mru ghost
-	 * list should never exceed the target size of the cache, and
-	 * the sum of the mru list, mfu list, mru ghost list, and mfu
-	 * ghost list should never exceed twice the target size of the
-	 * cache. The following logic enforces these limits on the ghost
-	 * caches, and evicts from them as needed.
-	 */
-	target = zfs_refcount_count(&arc_mru->arcs_size) +
-	    zfs_refcount_count(&arc_mru_ghost->arcs_size) - arc_c;
-
-	bytes = arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA);
-	total_evicted += bytes;
-
-	target -= bytes;
-
-	total_evicted +=
-	    arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_METADATA);
-
-	/*
-	 * We assume the sum of the mru list and mfu list is less than
-	 * or equal to arc_c (we enforced this above), which means we
-	 * can use the simpler of the two equations below:
-	 *
-	 *	mru + mfu + mru ghost + mfu ghost <= 2 * arc_c
-	 *	mru ghost + mfu ghost <= arc_c
-	 */
-	target = zfs_refcount_count(&arc_mru_ghost->arcs_size) +
-	    zfs_refcount_count(&arc_mfu_ghost->arcs_size) - arc_c;
-
-	bytes = arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA);
-	total_evicted += bytes;
-
-	target -= bytes;
-
-	total_evicted +=
-	    arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_METADATA);
-
-	return (total_evicted);
-}
-
-void
-arc_flush(spa_t *spa, boolean_t retry)
-{
-	uint64_t guid = 0;
-
-	/*
-	 * If retry is B_TRUE, a spa must not be specified since we have
-	 * no good way to determine if all of a spa's buffers have been
-	 * evicted from an arc state.
- */ - ASSERT(!retry || spa == 0); - - if (spa != NULL) - guid = spa_load_guid(spa); - - (void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry); - (void) arc_flush_state(arc_mru, guid, ARC_BUFC_METADATA, retry); - - (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_DATA, retry); - (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_METADATA, retry); - - (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_DATA, retry); - (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_METADATA, retry); - - (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry); - (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry); -} - -static void -arc_reduce_target_size(int64_t to_free) -{ - uint64_t asize = aggsum_value(&arc_size); - if (arc_c > arc_c_min) { - DTRACE_PROBE4(arc__shrink, uint64_t, arc_c, uint64_t, - arc_c_min, uint64_t, arc_p, uint64_t, to_free); - if (arc_c > arc_c_min + to_free) - atomic_add_64(&arc_c, -to_free); - else - arc_c = arc_c_min; - - atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); - if (asize < arc_c) - arc_c = MAX(asize, arc_c_min); - if (arc_p > arc_c) - arc_p = (arc_c >> 1); - - DTRACE_PROBE2(arc__shrunk, uint64_t, arc_c, uint64_t, - arc_p); - - ASSERT(arc_c >= arc_c_min); - ASSERT((int64_t)arc_p >= 0); - } - - if (asize > arc_c) { - DTRACE_PROBE2(arc__shrink_adjust, uint64_t, asize, - uint64_t, arc_c); - /* See comment in arc_adjust_cb_check() on why lock+flag */ - mutex_enter(&arc_adjust_lock); - arc_adjust_needed = B_TRUE; - mutex_exit(&arc_adjust_lock); - zthr_wakeup(arc_adjust_zthr); - } -} - -typedef enum free_memory_reason_t { - FMR_UNKNOWN, - FMR_NEEDFREE, - FMR_LOTSFREE, - FMR_SWAPFS_MINFREE, - FMR_PAGES_PP_MAXIMUM, - FMR_HEAP_ARENA, - FMR_ZIO_ARENA, -} free_memory_reason_t; - -int64_t last_free_memory; -free_memory_reason_t last_free_reason; - -/* - * Additional reserve of pages for pp_reserve. - */ -int64_t arc_pages_pp_reserve = 64; - -/* - * Additional reserve of pages for swapfs. - */ -int64_t arc_swapfs_reserve = 64; - -/* - * Return the amount of memory that can be consumed before reclaim will be - * needed. Positive if there is sufficient free memory, negative indicates - * the amount of memory that needs to be freed up. - */ -static int64_t -arc_available_memory(void) -{ - int64_t lowest = INT64_MAX; - int64_t n; - free_memory_reason_t r = FMR_UNKNOWN; - -#ifdef _KERNEL -#ifdef __FreeBSD__ - /* - * Cooperate with pagedaemon when it's time for it to scan - * and reclaim some pages. - */ - n = PAGESIZE * ((int64_t)freemem - zfs_arc_free_target); - if (n < lowest) { - lowest = n; - r = FMR_LOTSFREE; - } - -#else - if (needfree > 0) { - n = PAGESIZE * (-needfree); - if (n < lowest) { - lowest = n; - r = FMR_NEEDFREE; - } - } - - /* - * check that we're out of range of the pageout scanner. It starts to - * schedule paging if freemem is less than lotsfree and needfree. - * lotsfree is the high-water mark for pageout, and needfree is the - * number of needed free pages. We add extra pages here to make sure - * the scanner doesn't start up while we're freeing memory. - */ - n = PAGESIZE * (freemem - lotsfree - needfree - desfree); - if (n < lowest) { - lowest = n; - r = FMR_LOTSFREE; - } - - /* - * check to make sure that swapfs has enough space so that anon - * reservations can still succeed. anon_resvmem() checks that the - * availrmem is greater than swapfs_minfree, and the number of reserved - * swap pages. We also add a bit of extra here just to prevent - * circumstances from getting really dire. 
- */ - n = PAGESIZE * (availrmem - swapfs_minfree - swapfs_reserve - - desfree - arc_swapfs_reserve); - if (n < lowest) { - lowest = n; - r = FMR_SWAPFS_MINFREE; - } - - - /* - * Check that we have enough availrmem that memory locking (e.g., via - * mlock(3C) or memcntl(2)) can still succeed. (pages_pp_maximum - * stores the number of pages that cannot be locked; when availrmem - * drops below pages_pp_maximum, page locking mechanisms such as - * page_pp_lock() will fail.) - */ - n = PAGESIZE * (availrmem - pages_pp_maximum - - arc_pages_pp_reserve); - if (n < lowest) { - lowest = n; - r = FMR_PAGES_PP_MAXIMUM; - } - -#endif /* __FreeBSD__ */ -#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC) - /* - * If we're on an i386 platform, it's possible that we'll exhaust the - * kernel heap space before we ever run out of available physical - * memory. Most checks of the size of the heap_area compare against - * tune.t_minarmem, which is the minimum available real memory that we - * can have in the system. However, this is generally fixed at 25 pages - * which is so low that it's useless. In this comparison, we seek to - * calculate the total heap-size, and reclaim if more than 3/4ths of the - * heap is allocated. (Or, in the calculation, if less than 1/4th is - * free) - */ - n = uma_avail() - (long)(uma_limit() / 4); - if (n < lowest) { - lowest = n; - r = FMR_HEAP_ARENA; - } -#endif - - /* - * If zio data pages are being allocated out of a separate heap segment, - * then enforce that the size of available vmem for this arena remains - * above about 1/4th (1/(2^arc_zio_arena_free_shift)) free. - * - * Note that reducing the arc_zio_arena_free_shift keeps more virtual - * memory (in the zio_arena) free, which can avoid memory - * fragmentation issues. - */ - if (zio_arena != NULL) { - n = (int64_t)vmem_size(zio_arena, VMEM_FREE) - - (vmem_size(zio_arena, VMEM_ALLOC) >> - arc_zio_arena_free_shift); - if (n < lowest) { - lowest = n; - r = FMR_ZIO_ARENA; - } - } - -#else /* _KERNEL */ - /* Every 100 calls, free a small amount */ - if (spa_get_random(100) == 0) - lowest = -1024; -#endif /* _KERNEL */ - - last_free_memory = lowest; - last_free_reason = r; - DTRACE_PROBE2(arc__available_memory, int64_t, lowest, int, r); - return (lowest); -} - - -/* - * Determine if the system is under memory pressure and is asking - * to reclaim memory. A return value of B_TRUE indicates that the system - * is under memory pressure and that the arc should adjust accordingly. - */ -static boolean_t -arc_reclaim_needed(void) -{ - return (arc_available_memory() < 0); -} - -extern kmem_cache_t *zio_buf_cache[]; -extern kmem_cache_t *zio_data_buf_cache[]; -extern kmem_cache_t *range_seg_cache; -extern kmem_cache_t *abd_chunk_cache; - -static __noinline void -arc_kmem_reap_soon(void) -{ - size_t i; - kmem_cache_t *prev_cache = NULL; - kmem_cache_t *prev_data_cache = NULL; - - DTRACE_PROBE(arc__kmem_reap_start); -#ifdef _KERNEL - if (aggsum_compare(&arc_meta_used, arc_meta_limit) >= 0) { - /* - * We are exceeding our meta-data cache limit. - * Purge some DNLC entries to release holds on meta-data. - */ - dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent); - } -#if defined(__i386) - /* - * Reclaim unused memory from all kmem caches. 
- */ - kmem_reap(); -#endif -#endif - - for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { - if (zio_buf_cache[i] != prev_cache) { - prev_cache = zio_buf_cache[i]; - kmem_cache_reap_soon(zio_buf_cache[i]); - } - if (zio_data_buf_cache[i] != prev_data_cache) { - prev_data_cache = zio_data_buf_cache[i]; - kmem_cache_reap_soon(zio_data_buf_cache[i]); - } - } - kmem_cache_reap_soon(abd_chunk_cache); - kmem_cache_reap_soon(buf_cache); - kmem_cache_reap_soon(hdr_full_cache); - kmem_cache_reap_soon(hdr_l2only_cache); - kmem_cache_reap_soon(range_seg_cache); - -#ifdef illumos - if (zio_arena != NULL) { - /* - * Ask the vmem arena to reclaim unused memory from its - * quantum caches. - */ - vmem_qcache_reap(zio_arena); - } -#endif - DTRACE_PROBE(arc__kmem_reap_end); -} - -/* ARGSUSED */ -static boolean_t -arc_adjust_cb_check(void *arg, zthr_t *zthr) -{ - /* - * This is necessary in order for the mdb ::arc dcmd to - * show up to date information. Since the ::arc command - * does not call the kstat's update function, without - * this call, the command may show stale stats for the - * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even - * with this change, the data might be up to 1 second - * out of date(the arc_adjust_zthr has a maximum sleep - * time of 1 second); but that should suffice. The - * arc_state_t structures can be queried directly if more - * accurate information is needed. - */ - if (arc_ksp != NULL) - arc_ksp->ks_update(arc_ksp, KSTAT_READ); - - /* - * We have to rely on arc_get_data_impl() to tell us when to adjust, - * rather than checking if we are overflowing here, so that we are - * sure to not leave arc_get_data_impl() waiting on - * arc_adjust_waiters_cv. If we have become "not overflowing" since - * arc_get_data_impl() checked, we need to wake it up. We could - * broadcast the CV here, but arc_get_data_impl() may have not yet - * gone to sleep. We would need to use a mutex to ensure that this - * function doesn't broadcast until arc_get_data_impl() has gone to - * sleep (e.g. the arc_adjust_lock). However, the lock ordering of - * such a lock would necessarily be incorrect with respect to the - * zthr_lock, which is held before this function is called, and is - * held by arc_get_data_impl() when it calls zthr_wakeup(). - */ - return (arc_adjust_needed); -} - -/* - * Keep arc_size under arc_c by running arc_adjust which evicts data - * from the ARC. */ -/* ARGSUSED */ -static void -arc_adjust_cb(void *arg, zthr_t *zthr) -{ - uint64_t evicted = 0; - - /* Evict from cache */ - evicted = arc_adjust(); - - /* - * If evicted is zero, we couldn't evict anything - * via arc_adjust(). This could be due to hash lock - * collisions, but more likely due to the majority of - * arc buffers being unevictable. Therefore, even if - * arc_size is above arc_c, another pass is unlikely to - * be helpful and could potentially cause us to enter an - * infinite loop. Additionally, zthr_iscancelled() is - * checked here so that if the arc is shutting down, the - * broadcast will wake any remaining arc adjust waiters. - */ - mutex_enter(&arc_adjust_lock); - arc_adjust_needed = !zthr_iscancelled(arc_adjust_zthr) && - evicted > 0 && aggsum_compare(&arc_size, arc_c) > 0; - if (!arc_adjust_needed) { - /* - * We're either no longer overflowing, or we - * can't evict anything more, so we should wake - * up any waiters. 
- */ - cv_broadcast(&arc_adjust_waiters_cv); - } - mutex_exit(&arc_adjust_lock); -} - -/* ARGSUSED */ -static boolean_t -arc_reap_cb_check(void *arg, zthr_t *zthr) -{ - int64_t free_memory = arc_available_memory(); - - /* - * If a kmem reap is already active, don't schedule more. We must - * check for this because kmem_cache_reap_soon() won't actually - * block on the cache being reaped (this is to prevent callers from - * becoming implicitly blocked by a system-wide kmem reap -- which, - * on a system with many, many full magazines, can take minutes). - */ - if (!kmem_cache_reap_active() && - free_memory < 0) { - arc_no_grow = B_TRUE; - arc_warm = B_TRUE; - /* - * Wait at least zfs_grow_retry (default 60) seconds - * before considering growing. - */ - arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry); - return (B_TRUE); - } else if (free_memory < arc_c >> arc_no_grow_shift) { - arc_no_grow = B_TRUE; - } else if (gethrtime() >= arc_growtime) { - arc_no_grow = B_FALSE; - } - - return (B_FALSE); -} - -/* - * Keep enough free memory in the system by reaping the ARC's kmem - * caches. To cause more slabs to be reapable, we may reduce the - * target size of the cache (arc_c), causing the arc_adjust_cb() - * to free more buffers. - */ -/* ARGSUSED */ -static void -arc_reap_cb(void *arg, zthr_t *zthr) -{ - int64_t free_memory; - - /* - * Kick off asynchronous kmem_reap()'s of all our caches. - */ - arc_kmem_reap_soon(); - - /* - * Wait at least arc_kmem_cache_reap_retry_ms between - * arc_kmem_reap_soon() calls. Without this check it is possible to - * end up in a situation where we spend lots of time reaping - * caches, while we're near arc_c_min. Waiting here also gives the - * subsequent free memory check a chance of finding that the - * asynchronous reap has already freed enough memory, and we don't - * need to call arc_reduce_target_size(). - */ - delay((hz * arc_kmem_cache_reap_retry_ms + 999) / 1000); - - /* - * Reduce the target size as needed to maintain the amount of free - * memory in the system at a fraction of the arc_size (1/128th by - * default). If oversubscribed (free_memory < 0) then reduce the - * target arc_size by the deficit amount plus the fractional - * amount. If free memory is positive but less then the fractional - * amount, reduce by what is needed to hit the fractional amount. - */ - free_memory = arc_available_memory(); - - int64_t to_free = - (arc_c >> arc_shrink_shift) - free_memory; - if (to_free > 0) { -#ifdef _KERNEL -#ifdef illumos - to_free = MAX(to_free, ptob(needfree)); -#endif -#endif - arc_reduce_target_size(to_free); - } -} - -static u_int arc_dnlc_evicts_arg; -extern struct vfsops zfs_vfsops; - -static void -arc_dnlc_evicts_thread(void *dummy __unused) -{ - callb_cpr_t cpr; - u_int percent; - - CALLB_CPR_INIT(&cpr, &arc_dnlc_evicts_lock, callb_generic_cpr, FTAG); - - mutex_enter(&arc_dnlc_evicts_lock); - while (!arc_dnlc_evicts_thread_exit) { - CALLB_CPR_SAFE_BEGIN(&cpr); - (void) cv_wait(&arc_dnlc_evicts_cv, &arc_dnlc_evicts_lock); - CALLB_CPR_SAFE_END(&cpr, &arc_dnlc_evicts_lock); - if (arc_dnlc_evicts_arg != 0) { - percent = arc_dnlc_evicts_arg; - mutex_exit(&arc_dnlc_evicts_lock); -#ifdef _KERNEL - vnlru_free(desiredvnodes * percent / 100, &zfs_vfsops); -#endif - mutex_enter(&arc_dnlc_evicts_lock); - /* - * Clear our token only after vnlru_free() - * pass is done, to avoid false queueing of - * the requests. 
- */ - arc_dnlc_evicts_arg = 0; - } - } - arc_dnlc_evicts_thread_exit = FALSE; - cv_broadcast(&arc_dnlc_evicts_cv); - CALLB_CPR_EXIT(&cpr); - thread_exit(); -} - -void -dnlc_reduce_cache(void *arg) -{ - u_int percent; - - percent = (u_int)(uintptr_t)arg; - mutex_enter(&arc_dnlc_evicts_lock); - if (arc_dnlc_evicts_arg == 0) { - arc_dnlc_evicts_arg = percent; - cv_broadcast(&arc_dnlc_evicts_cv); - } - mutex_exit(&arc_dnlc_evicts_lock); -} - -/* - * Adapt arc info given the number of bytes we are trying to add and - * the state that we are comming from. This function is only called - * when we are adding new content to the cache. - */ -static void -arc_adapt(int bytes, arc_state_t *state) -{ - int mult; - uint64_t arc_p_min = (arc_c >> arc_p_min_shift); - int64_t mrug_size = zfs_refcount_count(&arc_mru_ghost->arcs_size); - int64_t mfug_size = zfs_refcount_count(&arc_mfu_ghost->arcs_size); - - if (state == arc_l2c_only) - return; - - ASSERT(bytes > 0); - /* - * Adapt the target size of the MRU list: - * - if we just hit in the MRU ghost list, then increase - * the target size of the MRU list. - * - if we just hit in the MFU ghost list, then increase - * the target size of the MFU list by decreasing the - * target size of the MRU list. - */ - if (state == arc_mru_ghost) { - mult = (mrug_size >= mfug_size) ? 1 : (mfug_size / mrug_size); - mult = MIN(mult, 10); /* avoid wild arc_p adjustment */ - - arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult); - } else if (state == arc_mfu_ghost) { - uint64_t delta; - - mult = (mfug_size >= mrug_size) ? 1 : (mrug_size / mfug_size); - mult = MIN(mult, 10); - - delta = MIN(bytes * mult, arc_p); - arc_p = MAX(arc_p_min, arc_p - delta); - } - ASSERT((int64_t)arc_p >= 0); - - /* - * Wake reap thread if we do not have any available memory - */ - if (arc_reclaim_needed()) { - zthr_wakeup(arc_reap_zthr); - return; - } - - if (arc_no_grow) - return; - - if (arc_c >= arc_c_max) - return; - - /* - * If we're within (2 * maxblocksize) bytes of the target - * cache size, increment the target cache size - */ - if (aggsum_compare(&arc_size, arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) > - 0) { - DTRACE_PROBE1(arc__inc_adapt, int, bytes); - atomic_add_64(&arc_c, (int64_t)bytes); - if (arc_c > arc_c_max) - arc_c = arc_c_max; - else if (state == arc_anon) - atomic_add_64(&arc_p, (int64_t)bytes); - if (arc_p > arc_c) - arc_p = arc_c; - } - ASSERT((int64_t)arc_p >= 0); -} - -/* - * Check if arc_size has grown past our upper threshold, determined by - * zfs_arc_overflow_shift. - */ -static boolean_t -arc_is_overflowing(void) -{ - /* Always allow at least one block of overflow */ - int64_t overflow = MAX(SPA_MAXBLOCKSIZE, - arc_c >> zfs_arc_overflow_shift); - - /* - * We just compare the lower bound here for performance reasons. Our - * primary goals are to make sure that the arc never grows without - * bound, and that it can reach its maximum size. This check - * accomplishes both goals. The maximum amount we could run over by is - * 2 * aggsum_borrow_multiplier * NUM_CPUS * the average size of a block - * in the ARC. In practice, that's in the tens of MB, which is low - * enough to be safe. 
- */ - return (aggsum_lower_bound(&arc_size) >= (int64_t)arc_c + overflow); -} - -static abd_t * -arc_get_data_abd(arc_buf_hdr_t *hdr, uint64_t size, void *tag, boolean_t do_adapt) -{ - arc_buf_contents_t type = arc_buf_type(hdr); - - arc_get_data_impl(hdr, size, tag, do_adapt); - if (type == ARC_BUFC_METADATA) { - return (abd_alloc(size, B_TRUE)); - } else { - ASSERT(type == ARC_BUFC_DATA); - return (abd_alloc(size, B_FALSE)); - } -} - -static void * -arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag) -{ - arc_buf_contents_t type = arc_buf_type(hdr); - - arc_get_data_impl(hdr, size, tag, B_TRUE); - if (type == ARC_BUFC_METADATA) { - return (zio_buf_alloc(size)); - } else { - ASSERT(type == ARC_BUFC_DATA); - return (zio_data_buf_alloc(size)); - } -} - -/* - * Allocate a block and return it to the caller. If we are hitting the - * hard limit for the cache size, we must sleep, waiting for the eviction - * thread to catch up. If we're past the target size but below the hard - * limit, we'll only signal the reclaim thread and continue on. - */ -static void -arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag, boolean_t do_adapt) -{ - arc_state_t *state = hdr->b_l1hdr.b_state; - arc_buf_contents_t type = arc_buf_type(hdr); - - if (do_adapt) - arc_adapt(size, state); - - /* - * If arc_size is currently overflowing, and has grown past our - * upper limit, we must be adding data faster than the evict - * thread can evict. Thus, to ensure we don't compound the - * problem by adding more data and forcing arc_size to grow even - * further past it's target size, we halt and wait for the - * eviction thread to catch up. - * - * It's also possible that the reclaim thread is unable to evict - * enough buffers to get arc_size below the overflow limit (e.g. - * due to buffers being un-evictable, or hash lock collisions). - * In this case, we want to proceed regardless if we're - * overflowing; thus we don't use a while loop here. - */ - if (arc_is_overflowing()) { - mutex_enter(&arc_adjust_lock); - - /* - * Now that we've acquired the lock, we may no longer be - * over the overflow limit, lets check. - * - * We're ignoring the case of spurious wake ups. If that - * were to happen, it'd let this thread consume an ARC - * buffer before it should have (i.e. before we're under - * the overflow limit and were signalled by the reclaim - * thread). As long as that is a rare occurrence, it - * shouldn't cause any harm. - */ - if (arc_is_overflowing()) { - arc_adjust_needed = B_TRUE; - zthr_wakeup(arc_adjust_zthr); - (void) cv_wait(&arc_adjust_waiters_cv, - &arc_adjust_lock); - } - mutex_exit(&arc_adjust_lock); - } - - VERIFY3U(hdr->b_type, ==, type); - if (type == ARC_BUFC_METADATA) { - arc_space_consume(size, ARC_SPACE_META); - } else { - arc_space_consume(size, ARC_SPACE_DATA); - } - - /* - * Update the state size. Note that ghost states have a - * "ghost size" and so don't need to be updated. - */ - if (!GHOST_STATE(state)) { - - (void) zfs_refcount_add_many(&state->arcs_size, size, tag); - - /* - * If this is reached via arc_read, the link is - * protected by the hash lock. If reached via - * arc_buf_alloc, the header should not be accessed by - * any other thread. And, if reached via arc_read_done, - * the hash lock will protect it if it's found in the - * hash table; otherwise no other thread should be - * trying to [add|remove]_reference it. 
- */ - if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { - ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); - (void) zfs_refcount_add_many(&state->arcs_esize[type], - size, tag); - } - - /* - * If we are growing the cache, and we are adding anonymous - * data, and we have outgrown arc_p, update arc_p - */ - if (aggsum_upper_bound(&arc_size) < arc_c && - hdr->b_l1hdr.b_state == arc_anon && - (zfs_refcount_count(&arc_anon->arcs_size) + - zfs_refcount_count(&arc_mru->arcs_size) > arc_p)) - arc_p = MIN(arc_c, arc_p + size); - } - ARCSTAT_BUMP(arcstat_allocated); -} - -static void -arc_free_data_abd(arc_buf_hdr_t *hdr, abd_t *abd, uint64_t size, void *tag) -{ - arc_free_data_impl(hdr, size, tag); - abd_free(abd); -} - -static void -arc_free_data_buf(arc_buf_hdr_t *hdr, void *buf, uint64_t size, void *tag) -{ - arc_buf_contents_t type = arc_buf_type(hdr); - - arc_free_data_impl(hdr, size, tag); - if (type == ARC_BUFC_METADATA) { - zio_buf_free(buf, size); - } else { - ASSERT(type == ARC_BUFC_DATA); - zio_data_buf_free(buf, size); - } -} - -/* - * Free the arc data buffer. - */ -static void -arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag) -{ - arc_state_t *state = hdr->b_l1hdr.b_state; - arc_buf_contents_t type = arc_buf_type(hdr); - - /* protected by hash lock, if in the hash table */ - if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { - ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); - ASSERT(state != arc_anon && state != arc_l2c_only); - - (void) zfs_refcount_remove_many(&state->arcs_esize[type], - size, tag); - } - (void) zfs_refcount_remove_many(&state->arcs_size, size, tag); - - VERIFY3U(hdr->b_type, ==, type); - if (type == ARC_BUFC_METADATA) { - arc_space_return(size, ARC_SPACE_META); - } else { - ASSERT(type == ARC_BUFC_DATA); - arc_space_return(size, ARC_SPACE_DATA); - } -} - -/* - * This routine is called whenever a buffer is accessed. - * NOTE: the hash lock is dropped in this function. - */ -static void -arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) -{ - clock_t now; - - ASSERT(MUTEX_HELD(hash_lock)); - ASSERT(HDR_HAS_L1HDR(hdr)); - - if (hdr->b_l1hdr.b_state == arc_anon) { - /* - * This buffer is not in the cache, and does not - * appear in our "ghost" list. Add the new buffer - * to the MRU state. - */ - - ASSERT0(hdr->b_l1hdr.b_arc_access); - hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); - DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); - arc_change_state(arc_mru, hdr, hash_lock); - - } else if (hdr->b_l1hdr.b_state == arc_mru) { - now = ddi_get_lbolt(); - - /* - * If this buffer is here because of a prefetch, then either: - * - clear the flag if this is a "referencing" read - * (any subsequent access will bump this into the MFU state). - * or - * - move the buffer to the head of the list if this is - * another prefetch (to make it less likely to be evicted). - */ - if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) { - if (zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { - /* link protected by hash lock */ - ASSERT(multilist_link_active( - &hdr->b_l1hdr.b_arc_node)); - } else { - arc_hdr_clear_flags(hdr, - ARC_FLAG_PREFETCH | - ARC_FLAG_PRESCIENT_PREFETCH); - ARCSTAT_BUMP(arcstat_mru_hits); - } - hdr->b_l1hdr.b_arc_access = now; - return; - } - - /* - * This buffer has been "accessed" only once so far, - * but it is still in the cache. Move it to the MFU - * state. - */ - if (now > hdr->b_l1hdr.b_arc_access + ARC_MINTIME) { - /* - * More than 125ms have passed since we - * instantiated this buffer. 
Move it to the - * most frequently used state. - */ - hdr->b_l1hdr.b_arc_access = now; - DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); - arc_change_state(arc_mfu, hdr, hash_lock); - } - atomic_inc_32(&hdr->b_l1hdr.b_mru_hits); - ARCSTAT_BUMP(arcstat_mru_hits); - } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) { - arc_state_t *new_state; - /* - * This buffer has been "accessed" recently, but - * was evicted from the cache. Move it to the - * MFU state. - */ - - if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) { - new_state = arc_mru; - if (zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) { - arc_hdr_clear_flags(hdr, - ARC_FLAG_PREFETCH | - ARC_FLAG_PRESCIENT_PREFETCH); - } - DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); - } else { - new_state = arc_mfu; - DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); - } - - hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); - arc_change_state(new_state, hdr, hash_lock); - - atomic_inc_32(&hdr->b_l1hdr.b_mru_ghost_hits); - ARCSTAT_BUMP(arcstat_mru_ghost_hits); - } else if (hdr->b_l1hdr.b_state == arc_mfu) { - /* - * This buffer has been accessed more than once and is - * still in the cache. Keep it in the MFU state. - * - * NOTE: an add_reference() that occurred when we did - * the arc_read() will have kicked this off the list. - * If it was a prefetch, we will explicitly move it to - * the head of the list now. - */ - - atomic_inc_32(&hdr->b_l1hdr.b_mfu_hits); - ARCSTAT_BUMP(arcstat_mfu_hits); - hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); - } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) { - arc_state_t *new_state = arc_mfu; - /* - * This buffer has been accessed more than once but has - * been evicted from the cache. Move it back to the - * MFU state. - */ - - if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) { - /* - * This is a prefetch access... - * move this block back to the MRU state. - */ - new_state = arc_mru; - } - - hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); - DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); - arc_change_state(new_state, hdr, hash_lock); - - atomic_inc_32(&hdr->b_l1hdr.b_mfu_ghost_hits); - ARCSTAT_BUMP(arcstat_mfu_ghost_hits); - } else if (hdr->b_l1hdr.b_state == arc_l2c_only) { - /* - * This buffer is on the 2nd Level ARC. - */ - - hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); - DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); - arc_change_state(arc_mfu, hdr, hash_lock); - } else { - ASSERT(!"invalid arc state"); - } -} - -/* - * This routine is called by dbuf_hold() to update the arc_access() state - * which otherwise would be skipped for entries in the dbuf cache. - */ -void -arc_buf_access(arc_buf_t *buf) -{ - mutex_enter(&buf->b_evict_lock); - arc_buf_hdr_t *hdr = buf->b_hdr; - - /* - * Avoid taking the hash_lock when possible as an optimization. - * The header must be checked again under the hash_lock in order - * to handle the case where it is concurrently being released. 
- */ - if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) { - mutex_exit(&buf->b_evict_lock); - ARCSTAT_BUMP(arcstat_access_skip); - return; - } - - kmutex_t *hash_lock = HDR_LOCK(hdr); - mutex_enter(hash_lock); - - if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) { - mutex_exit(hash_lock); - mutex_exit(&buf->b_evict_lock); - ARCSTAT_BUMP(arcstat_access_skip); - return; - } - - mutex_exit(&buf->b_evict_lock); - - ASSERT(hdr->b_l1hdr.b_state == arc_mru || - hdr->b_l1hdr.b_state == arc_mfu); - - DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); - arc_access(hdr, hash_lock); - mutex_exit(hash_lock); - - ARCSTAT_BUMP(arcstat_hits); - ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), - demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, hits); -} - -/* a generic arc_read_done_func_t which you can use */ -/* ARGSUSED */ -void -arc_bcopy_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, - arc_buf_t *buf, void *arg) -{ - if (buf == NULL) - return; - - bcopy(buf->b_data, arg, arc_buf_size(buf)); - arc_buf_destroy(buf, arg); -} - -/* a generic arc_read_done_func_t */ -/* ARGSUSED */ -void -arc_getbuf_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, - arc_buf_t *buf, void *arg) -{ - arc_buf_t **bufp = arg; - if (buf == NULL) { - ASSERT(zio == NULL || zio->io_error != 0); - *bufp = NULL; - } else { - ASSERT(zio == NULL || zio->io_error == 0); - *bufp = buf; - ASSERT(buf->b_data != NULL); - } -} - -static void -arc_hdr_verify(arc_buf_hdr_t *hdr, blkptr_t *bp) -{ - if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) { - ASSERT3U(HDR_GET_PSIZE(hdr), ==, 0); - ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); - } else { - if (HDR_COMPRESSION_ENABLED(hdr)) { - ASSERT3U(HDR_GET_COMPRESS(hdr), ==, - BP_GET_COMPRESS(bp)); - } - ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp)); - ASSERT3U(HDR_GET_PSIZE(hdr), ==, BP_GET_PSIZE(bp)); - } -} - -static void -arc_read_done(zio_t *zio) -{ - arc_buf_hdr_t *hdr = zio->io_private; - kmutex_t *hash_lock = NULL; - arc_callback_t *callback_list; - arc_callback_t *acb; - boolean_t freeable = B_FALSE; - boolean_t no_zio_error = (zio->io_error == 0); - - /* - * The hdr was inserted into hash-table and removed from lists - * prior to starting I/O. We should find this header, since - * it's in the hash table, and it should be legit since it's - * not possible to evict it during the I/O. The only possible - * reason for it not to be found is if we were freed during the - * read. 
- */ - if (HDR_IN_HASH_TABLE(hdr)) { - ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp)); - ASSERT3U(hdr->b_dva.dva_word[0], ==, - BP_IDENTITY(zio->io_bp)->dva_word[0]); - ASSERT3U(hdr->b_dva.dva_word[1], ==, - BP_IDENTITY(zio->io_bp)->dva_word[1]); - - arc_buf_hdr_t *found = buf_hash_find(hdr->b_spa, zio->io_bp, - &hash_lock); - - ASSERT((found == hdr && - DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || - (found == hdr && HDR_L2_READING(hdr))); - ASSERT3P(hash_lock, !=, NULL); - } - - if (no_zio_error) { - /* byteswap if necessary */ - if (BP_SHOULD_BYTESWAP(zio->io_bp)) { - if (BP_GET_LEVEL(zio->io_bp) > 0) { - hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64; - } else { - hdr->b_l1hdr.b_byteswap = - DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); - } - } else { - hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; - } - } - - arc_hdr_clear_flags(hdr, ARC_FLAG_L2_EVICTED); - if (l2arc_noprefetch && HDR_PREFETCH(hdr)) - arc_hdr_clear_flags(hdr, ARC_FLAG_L2CACHE); - - callback_list = hdr->b_l1hdr.b_acb; - ASSERT3P(callback_list, !=, NULL); - - if (hash_lock && no_zio_error && hdr->b_l1hdr.b_state == arc_anon) { - /* - * Only call arc_access on anonymous buffers. This is because - * if we've issued an I/O for an evicted buffer, we've already - * called arc_access (to prevent any simultaneous readers from - * getting confused). - */ - arc_access(hdr, hash_lock); - } - - /* - * If a read request has a callback (i.e. acb_done is not NULL), then we - * make a buf containing the data according to the parameters which were - * passed in. The implementation of arc_buf_alloc_impl() ensures that we - * aren't needlessly decompressing the data multiple times. - */ - int callback_cnt = 0; - for (acb = callback_list; acb != NULL; acb = acb->acb_next) { - if (!acb->acb_done) - continue; - - callback_cnt++; - - if (no_zio_error) { - int error = arc_buf_alloc_impl(hdr, acb->acb_private, - acb->acb_compressed, zio->io_error == 0, - &acb->acb_buf); - if (error != 0) { - /* - * Decompression failed. Set io_error - * so that when we call acb_done (below), - * we will indicate that the read failed. - * Note that in the unusual case where one - * callback is compressed and another - * uncompressed, we will mark all of them - * as failed, even though the uncompressed - * one can't actually fail. In this case, - * the hdr will not be anonymous, because - * if there are multiple callbacks, it's - * because multiple threads found the same - * arc buf in the hash table. - */ - zio->io_error = error; - } - } - } - /* - * If there are multiple callbacks, we must have the hash lock, - * because the only way for multiple threads to find this hdr is - * in the hash table. This ensures that if there are multiple - * callbacks, the hdr is not anonymous. If it were anonymous, - * we couldn't use arc_buf_destroy() in the error case below. 
- */ - ASSERT(callback_cnt < 2 || hash_lock != NULL); - - hdr->b_l1hdr.b_acb = NULL; - arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); - if (callback_cnt == 0) { - ASSERT(HDR_PREFETCH(hdr)); - ASSERT0(hdr->b_l1hdr.b_bufcnt); - ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); - } - - ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt) || - callback_list != NULL); - - if (no_zio_error) { - arc_hdr_verify(hdr, zio->io_bp); - } else { - arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR); - if (hdr->b_l1hdr.b_state != arc_anon) - arc_change_state(arc_anon, hdr, hash_lock); - if (HDR_IN_HASH_TABLE(hdr)) - buf_hash_remove(hdr); - freeable = zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt); - } - - /* - * Broadcast before we drop the hash_lock to avoid the possibility - * that the hdr (and hence the cv) might be freed before we get to - * the cv_broadcast(). - */ - cv_broadcast(&hdr->b_l1hdr.b_cv); - - if (hash_lock != NULL) { - mutex_exit(hash_lock); - } else { - /* - * This block was freed while we waited for the read to - * complete. It has been removed from the hash table and - * moved to the anonymous state (so that it won't show up - * in the cache). - */ - ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); - freeable = zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt); - } - - /* execute each callback and free its structure */ - while ((acb = callback_list) != NULL) { - if (acb->acb_done != NULL) { - if (zio->io_error != 0 && acb->acb_buf != NULL) { - /* - * If arc_buf_alloc_impl() fails during - * decompression, the buf will still be - * allocated, and needs to be freed here. - */ - arc_buf_destroy(acb->acb_buf, acb->acb_private); - acb->acb_buf = NULL; - } - acb->acb_done(zio, &zio->io_bookmark, zio->io_bp, - acb->acb_buf, acb->acb_private); - } - - if (acb->acb_zio_dummy != NULL) { - acb->acb_zio_dummy->io_error = zio->io_error; - zio_nowait(acb->acb_zio_dummy); - } - - callback_list = acb->acb_next; - kmem_free(acb, sizeof (arc_callback_t)); - } - - if (freeable) - arc_hdr_destroy(hdr); -} - -/* - * "Read" the block at the specified DVA (in bp) via the - * cache. If the block is found in the cache, invoke the provided - * callback immediately and return. Note that the `zio' parameter - * in the callback will be NULL in this case, since no IO was - * required. If the block is not in the cache pass the read request - * on to the spa with a substitute callback function, so that the - * requested block will be added to the cache. - * - * If a read request arrives for a block that has a read in-progress, - * either wait for the in-progress read to complete (and return the - * results); or, if this is a read with a "done" func, add a record - * to the read to invoke the "done" func when the read completes, - * and return; or just return. - * - * arc_read_done() will invoke all the requested "done" functions - * for readers of this block. - */ -int -arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_read_done_func_t *done, - void *private, zio_priority_t priority, int zio_flags, - arc_flags_t *arc_flags, const zbookmark_phys_t *zb) -{ - arc_buf_hdr_t *hdr = NULL; - kmutex_t *hash_lock = NULL; - zio_t *rzio; - uint64_t guid = spa_load_guid(spa); - boolean_t compressed_read = (zio_flags & ZIO_FLAG_RAW) != 0; - int rc = 0; - - ASSERT(!BP_IS_EMBEDDED(bp) || - BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA); - -top: - if (!BP_IS_EMBEDDED(bp)) { - /* - * Embedded BP's have no DVA and require no I/O to "read". - * Create an anonymous arc buf to back it. 
- */ - hdr = buf_hash_find(guid, bp, &hash_lock); - } - - if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_pabd != NULL) { - arc_buf_t *buf = NULL; - *arc_flags |= ARC_FLAG_CACHED; - - if (HDR_IO_IN_PROGRESS(hdr)) { - zio_t *head_zio = hdr->b_l1hdr.b_acb->acb_zio_head; - - ASSERT3P(head_zio, !=, NULL); - if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) && - priority == ZIO_PRIORITY_SYNC_READ) { - /* - * This is a sync read that needs to wait for - * an in-flight async read. Request that the - * zio have its priority upgraded. - */ - zio_change_priority(head_zio, priority); - DTRACE_PROBE1(arc__async__upgrade__sync, - arc_buf_hdr_t *, hdr); - ARCSTAT_BUMP(arcstat_async_upgrade_sync); - } - if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { - arc_hdr_clear_flags(hdr, - ARC_FLAG_PREDICTIVE_PREFETCH); - } - - if (*arc_flags & ARC_FLAG_WAIT) { - cv_wait(&hdr->b_l1hdr.b_cv, hash_lock); - mutex_exit(hash_lock); - goto top; - } - ASSERT(*arc_flags & ARC_FLAG_NOWAIT); - - if (done) { - arc_callback_t *acb = NULL; - - acb = kmem_zalloc(sizeof (arc_callback_t), - KM_SLEEP); - acb->acb_done = done; - acb->acb_private = private; - acb->acb_compressed = compressed_read; - if (pio != NULL) - acb->acb_zio_dummy = zio_null(pio, - spa, NULL, NULL, NULL, zio_flags); - - ASSERT3P(acb->acb_done, !=, NULL); - acb->acb_zio_head = head_zio; - acb->acb_next = hdr->b_l1hdr.b_acb; - hdr->b_l1hdr.b_acb = acb; - mutex_exit(hash_lock); - return (0); - } - mutex_exit(hash_lock); - return (0); - } - - ASSERT(hdr->b_l1hdr.b_state == arc_mru || - hdr->b_l1hdr.b_state == arc_mfu); - - if (done) { - if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { - /* - * This is a demand read which does not have to - * wait for i/o because we did a predictive - * prefetch i/o for it, which has completed. - */ - DTRACE_PROBE1( - arc__demand__hit__predictive__prefetch, - arc_buf_hdr_t *, hdr); - ARCSTAT_BUMP( - arcstat_demand_hit_predictive_prefetch); - arc_hdr_clear_flags(hdr, - ARC_FLAG_PREDICTIVE_PREFETCH); - } - - if (hdr->b_flags & ARC_FLAG_PRESCIENT_PREFETCH) { - ARCSTAT_BUMP( - arcstat_demand_hit_prescient_prefetch); - arc_hdr_clear_flags(hdr, - ARC_FLAG_PRESCIENT_PREFETCH); - } - - ASSERT(!BP_IS_EMBEDDED(bp) || !BP_IS_HOLE(bp)); - /* Get a buf with the desired data in it. 
*/ - rc = arc_buf_alloc_impl(hdr, private, - compressed_read, B_TRUE, &buf); - if (rc != 0) { - arc_buf_destroy(buf, private); - buf = NULL; - } - ASSERT((zio_flags & ZIO_FLAG_SPECULATIVE) || - rc == 0 || rc != ENOENT); - } else if (*arc_flags & ARC_FLAG_PREFETCH && - zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { - arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); - } - DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); - arc_access(hdr, hash_lock); - if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH) - arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH); - if (*arc_flags & ARC_FLAG_L2CACHE) - arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); - mutex_exit(hash_lock); - ARCSTAT_BUMP(arcstat_hits); - ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), - demand, prefetch, !HDR_ISTYPE_METADATA(hdr), - data, metadata, hits); - - if (done) - done(NULL, zb, bp, buf, private); - } else { - uint64_t lsize = BP_GET_LSIZE(bp); - uint64_t psize = BP_GET_PSIZE(bp); - arc_callback_t *acb; - vdev_t *vd = NULL; - uint64_t addr = 0; - boolean_t devw = B_FALSE; - uint64_t size; - - if (hdr == NULL) { - /* this block is not in the cache */ - arc_buf_hdr_t *exists = NULL; - arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); - hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, - BP_GET_COMPRESS(bp), type); - - if (!BP_IS_EMBEDDED(bp)) { - hdr->b_dva = *BP_IDENTITY(bp); - hdr->b_birth = BP_PHYSICAL_BIRTH(bp); - exists = buf_hash_insert(hdr, &hash_lock); - } - if (exists != NULL) { - /* somebody beat us to the hash insert */ - mutex_exit(hash_lock); - buf_discard_identity(hdr); - arc_hdr_destroy(hdr); - goto top; /* restart the IO request */ - } - } else { - /* - * This block is in the ghost cache. If it was L2-only - * (and thus didn't have an L1 hdr), we realloc the - * header to add an L1 hdr. - */ - if (!HDR_HAS_L1HDR(hdr)) { - hdr = arc_hdr_realloc(hdr, hdr_l2only_cache, - hdr_full_cache); - } - ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); - ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state)); - ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); - ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); - ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); - - /* - * This is a delicate dance that we play here. - * This hdr is in the ghost list so we access it - * to move it out of the ghost list before we - * initiate the read. If it's a prefetch then - * it won't have a callback so we'll remove the - * reference that arc_buf_alloc_impl() created. We - * do this after we've called arc_access() to - * avoid hitting an assert in remove_reference(). - */ - arc_adapt(arc_hdr_size(hdr), hdr->b_l1hdr.b_state); - arc_access(hdr, hash_lock); - arc_hdr_alloc_pabd(hdr, B_FALSE); - } - ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); - size = arc_hdr_size(hdr); - - /* - * If compression is enabled on the hdr, then will do - * RAW I/O and will store the compressed data in the hdr's - * data block. Otherwise, the hdr's data block will contain - * the uncompressed data. 
- */ - if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) { - zio_flags |= ZIO_FLAG_RAW; - } - - if (*arc_flags & ARC_FLAG_PREFETCH) - arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); - if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH) - arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH); - - if (*arc_flags & ARC_FLAG_L2CACHE) - arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); - if (BP_GET_LEVEL(bp) > 0) - arc_hdr_set_flags(hdr, ARC_FLAG_INDIRECT); - if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH) - arc_hdr_set_flags(hdr, ARC_FLAG_PREDICTIVE_PREFETCH); - ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state)); - - acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); - acb->acb_done = done; - acb->acb_private = private; - acb->acb_compressed = compressed_read; - - ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); - hdr->b_l1hdr.b_acb = acb; - arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); - - if (HDR_HAS_L2HDR(hdr) && - (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) { - devw = hdr->b_l2hdr.b_dev->l2ad_writing; - addr = hdr->b_l2hdr.b_daddr; - /* - * Lock out L2ARC device removal. - */ - if (vdev_is_dead(vd) || - !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER)) - vd = NULL; - } - - /* - * We count both async reads and scrub IOs as asynchronous so - * that both can be upgraded in the event of a cache hit while - * the read IO is still in-flight. - */ - if (priority == ZIO_PRIORITY_ASYNC_READ || - priority == ZIO_PRIORITY_SCRUB) - arc_hdr_set_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ); - else - arc_hdr_clear_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ); - - /* - * At this point, we have a level 1 cache miss. Try again in - * L2ARC if possible. - */ - ASSERT3U(HDR_GET_LSIZE(hdr), ==, lsize); - - DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp, - uint64_t, lsize, zbookmark_phys_t *, zb); - ARCSTAT_BUMP(arcstat_misses); - ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), - demand, prefetch, !HDR_ISTYPE_METADATA(hdr), - data, metadata, misses); -#ifdef _KERNEL -#ifdef RACCT - if (racct_enable) { - PROC_LOCK(curproc); - racct_add_force(curproc, RACCT_READBPS, size); - racct_add_force(curproc, RACCT_READIOPS, 1); - PROC_UNLOCK(curproc); - } -#endif /* RACCT */ - curthread->td_ru.ru_inblock++; -#endif - - if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) { - /* - * Read from the L2ARC if the following are true: - * 1. The L2ARC vdev was previously cached. - * 2. This buffer still has L2ARC metadata. - * 3. This buffer isn't currently writing to the L2ARC. - * 4. The L2ARC entry wasn't evicted, which may - * also have invalidated the vdev. - * 5. This isn't prefetch and l2arc_noprefetch is set. - */ - if (HDR_HAS_L2HDR(hdr) && - !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) && - !(l2arc_noprefetch && HDR_PREFETCH(hdr))) { - l2arc_read_callback_t *cb; - abd_t *abd; - uint64_t asize; - - DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); - ARCSTAT_BUMP(arcstat_l2_hits); - atomic_inc_32(&hdr->b_l2hdr.b_hits); - - cb = kmem_zalloc(sizeof (l2arc_read_callback_t), - KM_SLEEP); - cb->l2rcb_hdr = hdr; - cb->l2rcb_bp = *bp; - cb->l2rcb_zb = *zb; - cb->l2rcb_flags = zio_flags; - - asize = vdev_psize_to_asize(vd, size); - if (asize != size) { - abd = abd_alloc_for_io(asize, - HDR_ISTYPE_METADATA(hdr)); - cb->l2rcb_abd = abd; - } else { - abd = hdr->b_l1hdr.b_pabd; - } - - ASSERT(addr >= VDEV_LABEL_START_SIZE && - addr + asize <= vd->vdev_psize - - VDEV_LABEL_END_SIZE); - - /* - * l2arc read. The SCL_L2ARC lock will be - * released by l2arc_read_done(). - * Issue a null zio if the underlying buffer - * was squashed to zero size by compression. 
- */ - ASSERT3U(HDR_GET_COMPRESS(hdr), !=, - ZIO_COMPRESS_EMPTY); - rzio = zio_read_phys(pio, vd, addr, - asize, abd, - ZIO_CHECKSUM_OFF, - l2arc_read_done, cb, priority, - zio_flags | ZIO_FLAG_DONT_CACHE | - ZIO_FLAG_CANFAIL | - ZIO_FLAG_DONT_PROPAGATE | - ZIO_FLAG_DONT_RETRY, B_FALSE); - acb->acb_zio_head = rzio; - - if (hash_lock != NULL) - mutex_exit(hash_lock); - - DTRACE_PROBE2(l2arc__read, vdev_t *, vd, - zio_t *, rzio); - ARCSTAT_INCR(arcstat_l2_read_bytes, size); - - if (*arc_flags & ARC_FLAG_NOWAIT) { - zio_nowait(rzio); - return (0); - } - - ASSERT(*arc_flags & ARC_FLAG_WAIT); - if (zio_wait(rzio) == 0) - return (0); - - /* l2arc read error; goto zio_read() */ - if (hash_lock != NULL) - mutex_enter(hash_lock); - } else { - DTRACE_PROBE1(l2arc__miss, - arc_buf_hdr_t *, hdr); - ARCSTAT_BUMP(arcstat_l2_misses); - if (HDR_L2_WRITING(hdr)) - ARCSTAT_BUMP(arcstat_l2_rw_clash); - spa_config_exit(spa, SCL_L2ARC, vd); - } - } else { - if (vd != NULL) - spa_config_exit(spa, SCL_L2ARC, vd); - if (l2arc_ndev != 0) { - DTRACE_PROBE1(l2arc__miss, - arc_buf_hdr_t *, hdr); - ARCSTAT_BUMP(arcstat_l2_misses); - } - } - - rzio = zio_read(pio, spa, bp, hdr->b_l1hdr.b_pabd, size, - arc_read_done, hdr, priority, zio_flags, zb); - acb->acb_zio_head = rzio; - - if (hash_lock != NULL) - mutex_exit(hash_lock); - - if (*arc_flags & ARC_FLAG_WAIT) - return (zio_wait(rzio)); - - ASSERT(*arc_flags & ARC_FLAG_NOWAIT); - zio_nowait(rzio); - } - return (0); -} - -arc_prune_t * -arc_add_prune_callback(arc_prune_func_t *func, void *private) -{ - arc_prune_t *p; - - p = kmem_alloc(sizeof (*p), KM_SLEEP); - p->p_pfunc = func; - p->p_private = private; - list_link_init(&p->p_node); - zfs_refcount_create(&p->p_refcnt); - - mutex_enter(&arc_prune_mtx); - zfs_refcount_add(&p->p_refcnt, &arc_prune_list); - list_insert_head(&arc_prune_list, p); - mutex_exit(&arc_prune_mtx); - - return (p); -} - -void -arc_remove_prune_callback(arc_prune_t *p) -{ - boolean_t wait = B_FALSE; - mutex_enter(&arc_prune_mtx); - list_remove(&arc_prune_list, p); - if (zfs_refcount_remove(&p->p_refcnt, &arc_prune_list) > 0) - wait = B_TRUE; - mutex_exit(&arc_prune_mtx); - - /* wait for arc_prune_task to finish */ - if (wait) - taskq_wait(arc_prune_taskq); - ASSERT0(zfs_refcount_count(&p->p_refcnt)); - zfs_refcount_destroy(&p->p_refcnt); - kmem_free(p, sizeof (*p)); -} - -/* - * Notify the arc that a block was freed, and thus will never be used again. - */ -void -arc_freed(spa_t *spa, const blkptr_t *bp) -{ - arc_buf_hdr_t *hdr; - kmutex_t *hash_lock; - uint64_t guid = spa_load_guid(spa); - - ASSERT(!BP_IS_EMBEDDED(bp)); - - hdr = buf_hash_find(guid, bp, &hash_lock); - if (hdr == NULL) - return; - - /* - * We might be trying to free a block that is still doing I/O - * (i.e. prefetch) or has a reference (i.e. a dedup-ed, - * dmu_sync-ed block). If this block is being prefetched, then it - * would still have the ARC_FLAG_IO_IN_PROGRESS flag set on the hdr - * until the I/O completes. A block may also have a reference if it is - * part of a dedup-ed, dmu_synced write. The dmu_sync() function would - * have written the new block to its final resting place on disk but - * without the dedup flag set. This would have left the hdr in the MRU - * state and discoverable. When the txg finally syncs it detects that - * the block was overridden in open context and issues an override I/O. - * Since this is a dedup block, the override I/O will determine if the - * block is already in the DDT. 
If so, then it will replace the io_bp - * with the bp from the DDT and allow the I/O to finish. When the I/O - * reaches the done callback, dbuf_write_override_done, it will - * check to see if the io_bp and io_bp_override are identical. - * If they are not, then it indicates that the bp was replaced with - * the bp in the DDT and the override bp is freed. This allows - * us to arrive here with a reference on a block that is being - * freed. So if we have an I/O in progress, or a reference to - * this hdr, then we don't destroy the hdr. - */ - if (!HDR_HAS_L1HDR(hdr) || (!HDR_IO_IN_PROGRESS(hdr) && - zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt))) { - arc_change_state(arc_anon, hdr, hash_lock); - arc_hdr_destroy(hdr); - mutex_exit(hash_lock); - } else { - mutex_exit(hash_lock); - } - -} - -/* - * Release this buffer from the cache, making it an anonymous buffer. This - * must be done after a read and prior to modifying the buffer contents. - * If the buffer has more than one reference, we must make - * a new hdr for the buffer. - */ -void -arc_release(arc_buf_t *buf, void *tag) -{ - arc_buf_hdr_t *hdr = buf->b_hdr; - - /* - * It would be nice to assert that if it's DMU metadata (level > - * 0 || it's the dnode file), then it must be syncing context. - * But we don't know that information at this level. - */ - - mutex_enter(&buf->b_evict_lock); - - ASSERT(HDR_HAS_L1HDR(hdr)); - - /* - * We don't grab the hash lock prior to this check, because if - * the buffer's header is in the arc_anon state, it won't be - * linked into the hash table. - */ - if (hdr->b_l1hdr.b_state == arc_anon) { - mutex_exit(&buf->b_evict_lock); - ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - ASSERT(!HDR_IN_HASH_TABLE(hdr)); - ASSERT(!HDR_HAS_L2HDR(hdr)); - ASSERT(HDR_EMPTY(hdr)); - ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); - ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1); - ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); - - hdr->b_l1hdr.b_arc_access = 0; - - /* - * If the buf is being overridden then it may already - * have a hdr that is not empty. - */ - buf_discard_identity(hdr); - arc_buf_thaw(buf); - - return; - } - - kmutex_t *hash_lock = HDR_LOCK(hdr); - mutex_enter(hash_lock); - - /* - * This assignment is only valid as long as the hash_lock is - * held, we must be careful not to reference state or the - * b_state field after dropping the lock. - */ - arc_state_t *state = hdr->b_l1hdr.b_state; - ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); - ASSERT3P(state, !=, arc_anon); - - /* this buffer is not on any list */ - ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), >, 0); - - if (HDR_HAS_L2HDR(hdr)) { - mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx); - - /* - * We have to recheck this conditional again now that - * we're holding the l2ad_mtx to prevent a race with - * another thread which might be concurrently calling - * l2arc_evict(). In that case, l2arc_evict() might have - * destroyed the header's L2 portion as we were waiting - * to acquire the l2ad_mtx. - */ - if (HDR_HAS_L2HDR(hdr)) { - l2arc_trim(hdr); - arc_hdr_l2hdr_destroy(hdr); - } - - mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx); - } - - /* - * Do we have more than one buf? 
- */ - if (hdr->b_l1hdr.b_bufcnt > 1) { - arc_buf_hdr_t *nhdr; - uint64_t spa = hdr->b_spa; - uint64_t psize = HDR_GET_PSIZE(hdr); - uint64_t lsize = HDR_GET_LSIZE(hdr); - enum zio_compress compress = HDR_GET_COMPRESS(hdr); - arc_buf_contents_t type = arc_buf_type(hdr); - VERIFY3U(hdr->b_type, ==, type); - - ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL); - (void) remove_reference(hdr, hash_lock, tag); - - if (arc_buf_is_shared(buf) && !ARC_BUF_COMPRESSED(buf)) { - ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); - ASSERT(ARC_BUF_LAST(buf)); - } - - /* - * Pull the data off of this hdr and attach it to - * a new anonymous hdr. Also find the last buffer - * in the hdr's buffer list. - */ - arc_buf_t *lastbuf = arc_buf_remove(hdr, buf); - ASSERT3P(lastbuf, !=, NULL); - - /* - * If the current arc_buf_t and the hdr are sharing their data - * buffer, then we must stop sharing that block. - */ - if (arc_buf_is_shared(buf)) { - VERIFY(!arc_buf_is_shared(lastbuf)); - - /* - * First, sever the block sharing relationship between - * buf and the arc_buf_hdr_t. - */ - arc_unshare_buf(hdr, buf); - - /* - * Now we need to recreate the hdr's b_pabd. Since we - * have lastbuf handy, we try to share with it, but if - * we can't then we allocate a new b_pabd and copy the - * data from buf into it. - */ - if (arc_can_share(hdr, lastbuf)) { - arc_share_buf(hdr, lastbuf); - } else { - arc_hdr_alloc_pabd(hdr, B_TRUE); - abd_copy_from_buf(hdr->b_l1hdr.b_pabd, - buf->b_data, psize); - } - VERIFY3P(lastbuf->b_data, !=, NULL); - } else if (HDR_SHARED_DATA(hdr)) { - /* - * Uncompressed shared buffers are always at the end - * of the list. Compressed buffers don't have the - * same requirements. This makes it hard to - * simply assert that the lastbuf is shared so - * we rely on the hdr's compression flags to determine - * if we have a compressed, shared buffer. - */ - ASSERT(arc_buf_is_shared(lastbuf) || - HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF); - ASSERT(!ARC_BUF_SHARED(buf)); - } - ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); - ASSERT3P(state, !=, arc_l2c_only); - - (void) zfs_refcount_remove_many(&state->arcs_size, - arc_buf_size(buf), buf); - - if (zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { - ASSERT3P(state, !=, arc_l2c_only); - (void) zfs_refcount_remove_many( - &state->arcs_esize[type], - arc_buf_size(buf), buf); - } - - hdr->b_l1hdr.b_bufcnt -= 1; - arc_cksum_verify(buf); -#ifdef illumos - arc_buf_unwatch(buf); -#endif - - mutex_exit(hash_lock); - - /* - * Allocate a new hdr. The new hdr will contain a b_pabd - * buffer which will be freed in arc_write(). 
- */ - nhdr = arc_hdr_alloc(spa, psize, lsize, compress, type); - ASSERT3P(nhdr->b_l1hdr.b_buf, ==, NULL); - ASSERT0(nhdr->b_l1hdr.b_bufcnt); - ASSERT0(zfs_refcount_count(&nhdr->b_l1hdr.b_refcnt)); - VERIFY3U(nhdr->b_type, ==, type); - ASSERT(!HDR_SHARED_DATA(nhdr)); - - nhdr->b_l1hdr.b_buf = buf; - nhdr->b_l1hdr.b_bufcnt = 1; - (void) zfs_refcount_add(&nhdr->b_l1hdr.b_refcnt, tag); - buf->b_hdr = nhdr; - - mutex_exit(&buf->b_evict_lock); - (void) zfs_refcount_add_many(&arc_anon->arcs_size, - arc_buf_size(buf), buf); - } else { - mutex_exit(&buf->b_evict_lock); - ASSERT(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 1); - /* protected by hash lock, or hdr is on arc_anon */ - ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); - ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - arc_change_state(arc_anon, hdr, hash_lock); - hdr->b_l1hdr.b_arc_access = 0; - mutex_exit(hash_lock); - - buf_discard_identity(hdr); - arc_buf_thaw(buf); - } -} - -int -arc_released(arc_buf_t *buf) -{ - int released; - - mutex_enter(&buf->b_evict_lock); - released = (buf->b_data != NULL && - buf->b_hdr->b_l1hdr.b_state == arc_anon); - mutex_exit(&buf->b_evict_lock); - return (released); -} - -#ifdef ZFS_DEBUG -int -arc_referenced(arc_buf_t *buf) -{ - int referenced; - - mutex_enter(&buf->b_evict_lock); - referenced = (zfs_refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt)); - mutex_exit(&buf->b_evict_lock); - return (referenced); -} -#endif - -static void -arc_write_ready(zio_t *zio) -{ - arc_write_callback_t *callback = zio->io_private; - arc_buf_t *buf = callback->awcb_buf; - arc_buf_hdr_t *hdr = buf->b_hdr; - uint64_t psize = BP_IS_HOLE(zio->io_bp) ? 0 : BP_GET_PSIZE(zio->io_bp); - - ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT(!zfs_refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt)); - ASSERT(hdr->b_l1hdr.b_bufcnt > 0); - - /* - * If we're reexecuting this zio because the pool suspended, then - * cleanup any state that was previously set the first time the - * callback was invoked. - */ - if (zio->io_flags & ZIO_FLAG_REEXECUTED) { - arc_cksum_free(hdr); -#ifdef illumos - arc_buf_unwatch(buf); -#endif - if (hdr->b_l1hdr.b_pabd != NULL) { - if (arc_buf_is_shared(buf)) { - arc_unshare_buf(hdr, buf); - } else { - arc_hdr_free_pabd(hdr); - } - } - } - ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); - ASSERT(!HDR_SHARED_DATA(hdr)); - ASSERT(!arc_buf_is_shared(buf)); - - callback->awcb_ready(zio, buf, callback->awcb_private); - - if (HDR_IO_IN_PROGRESS(hdr)) - ASSERT(zio->io_flags & ZIO_FLAG_REEXECUTED); - - arc_cksum_compute(buf); - arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); - - enum zio_compress compress; - if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { - compress = ZIO_COMPRESS_OFF; - } else { - ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(zio->io_bp)); - compress = BP_GET_COMPRESS(zio->io_bp); - } - HDR_SET_PSIZE(hdr, psize); - arc_hdr_set_compress(hdr, compress); - - - /* - * Fill the hdr with data. If the hdr is compressed, the data we want - * is available from the zio, otherwise we can take it from the buf. - * - * We might be able to share the buf's data with the hdr here. However, - * doing so would cause the ARC to be full of linear ABDs if we write a - * lot of shareable data. As a compromise, we check whether scattered - * ABDs are allowed, and assume that if they are then the user wants - * the ARC to be primarily filled with them regardless of the data being - * written. Therefore, if they're allowed then we allocate one and copy - * the data into it; otherwise, we share the data directly if we can. 
- */ - if (zfs_abd_scatter_enabled || !arc_can_share(hdr, buf)) { - arc_hdr_alloc_pabd(hdr, B_TRUE); - - /* - * Ideally, we would always copy the io_abd into b_pabd, but the - * user may have disabled compressed ARC, thus we must check the - * hdr's compression setting rather than the io_bp's. - */ - if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) { - ASSERT3U(BP_GET_COMPRESS(zio->io_bp), !=, - ZIO_COMPRESS_OFF); - ASSERT3U(psize, >, 0); - - abd_copy(hdr->b_l1hdr.b_pabd, zio->io_abd, psize); - } else { - ASSERT3U(zio->io_orig_size, ==, arc_hdr_size(hdr)); - - abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data, - arc_buf_size(buf)); - } - } else { - ASSERT3P(buf->b_data, ==, abd_to_buf(zio->io_orig_abd)); - ASSERT3U(zio->io_orig_size, ==, arc_buf_size(buf)); - ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); - - arc_share_buf(hdr, buf); - } - - arc_hdr_verify(hdr, zio->io_bp); -} - -static void -arc_write_children_ready(zio_t *zio) -{ - arc_write_callback_t *callback = zio->io_private; - arc_buf_t *buf = callback->awcb_buf; - - callback->awcb_children_ready(zio, buf, callback->awcb_private); -} - -/* - * The SPA calls this callback for each physical write that happens on behalf - * of a logical write. See the comment in dbuf_write_physdone() for details. - */ -static void -arc_write_physdone(zio_t *zio) -{ - arc_write_callback_t *cb = zio->io_private; - if (cb->awcb_physdone != NULL) - cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private); -} - -static void -arc_write_done(zio_t *zio) -{ - arc_write_callback_t *callback = zio->io_private; - arc_buf_t *buf = callback->awcb_buf; - arc_buf_hdr_t *hdr = buf->b_hdr; - - ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); - - if (zio->io_error == 0) { - arc_hdr_verify(hdr, zio->io_bp); - - if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { - buf_discard_identity(hdr); - } else { - hdr->b_dva = *BP_IDENTITY(zio->io_bp); - hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); - } - } else { - ASSERT(HDR_EMPTY(hdr)); - } - - /* - * If the block to be written was all-zero or compressed enough to be - * embedded in the BP, no write was performed so there will be no - * dva/birth/checksum. The buffer must therefore remain anonymous - * (and uncached). - */ - if (!HDR_EMPTY(hdr)) { - arc_buf_hdr_t *exists; - kmutex_t *hash_lock; - - ASSERT3U(zio->io_error, ==, 0); - - arc_cksum_verify(buf); - - exists = buf_hash_insert(hdr, &hash_lock); - if (exists != NULL) { - /* - * This can only happen if we overwrite for - * sync-to-convergence, because we remove - * buffers from the hash table when we arc_free(). 
- */ - if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { - if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) - panic("bad overwrite, hdr=%p exists=%p", - (void *)hdr, (void *)exists); - ASSERT(zfs_refcount_is_zero( - &exists->b_l1hdr.b_refcnt)); - arc_change_state(arc_anon, exists, hash_lock); - mutex_exit(hash_lock); - arc_hdr_destroy(exists); - exists = buf_hash_insert(hdr, &hash_lock); - ASSERT3P(exists, ==, NULL); - } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) { - /* nopwrite */ - ASSERT(zio->io_prop.zp_nopwrite); - if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) - panic("bad nopwrite, hdr=%p exists=%p", - (void *)hdr, (void *)exists); - } else { - /* Dedup */ - ASSERT(hdr->b_l1hdr.b_bufcnt == 1); - ASSERT(hdr->b_l1hdr.b_state == arc_anon); - ASSERT(BP_GET_DEDUP(zio->io_bp)); - ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); - } - } - arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); - /* if it's not anon, we are doing a scrub */ - if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon) - arc_access(hdr, hash_lock); - mutex_exit(hash_lock); - } else { - arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); - } - - ASSERT(!zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); - callback->awcb_done(zio, buf, callback->awcb_private); - - abd_put(zio->io_abd); - kmem_free(callback, sizeof (arc_write_callback_t)); -} - -zio_t * -arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, - boolean_t l2arc, const zio_prop_t *zp, arc_write_done_func_t *ready, - arc_write_done_func_t *children_ready, arc_write_done_func_t *physdone, - arc_write_done_func_t *done, void *private, zio_priority_t priority, - int zio_flags, const zbookmark_phys_t *zb) -{ - arc_buf_hdr_t *hdr = buf->b_hdr; - arc_write_callback_t *callback; - zio_t *zio; - zio_prop_t localprop = *zp; - - ASSERT3P(ready, !=, NULL); - ASSERT3P(done, !=, NULL); - ASSERT(!HDR_IO_ERROR(hdr)); - ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); - ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0); - if (l2arc) - arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); - if (ARC_BUF_COMPRESSED(buf)) { - /* - * We're writing a pre-compressed buffer. Make the - * compression algorithm requested by the zio_prop_t match - * the pre-compressed buffer's compression algorithm. - */ - localprop.zp_compress = HDR_GET_COMPRESS(hdr); - - ASSERT3U(HDR_GET_LSIZE(hdr), !=, arc_buf_size(buf)); - zio_flags |= ZIO_FLAG_RAW; - } - callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); - callback->awcb_ready = ready; - callback->awcb_children_ready = children_ready; - callback->awcb_physdone = physdone; - callback->awcb_done = done; - callback->awcb_private = private; - callback->awcb_buf = buf; - - /* - * The hdr's b_pabd is now stale, free it now. A new data block - * will be allocated when the zio pipeline calls arc_write_ready(). - */ - if (hdr->b_l1hdr.b_pabd != NULL) { - /* - * If the buf is currently sharing the data block with - * the hdr then we need to break that relationship here. - * The hdr will remain with a NULL data pointer and the - * buf will take sole ownership of the block. - */ - if (arc_buf_is_shared(buf)) { - arc_unshare_buf(hdr, buf); - } else { - arc_hdr_free_pabd(hdr); - } - VERIFY3P(buf->b_data, !=, NULL); - arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF); - } - ASSERT(!arc_buf_is_shared(buf)); - ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); - - zio = zio_write(pio, spa, txg, bp, - abd_get_from_buf(buf->b_data, HDR_GET_LSIZE(hdr)), - HDR_GET_LSIZE(hdr), arc_buf_size(buf), &localprop, arc_write_ready, - (children_ready != NULL) ? 
arc_write_children_ready : NULL, - arc_write_physdone, arc_write_done, callback, - priority, zio_flags, zb); - - return (zio); -} - -static int -arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg) -{ -#ifdef _KERNEL - uint64_t available_memory = ptob(freemem); - -#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC) - available_memory = MIN(available_memory, uma_avail()); -#endif - - if (freemem > (uint64_t)physmem * arc_lotsfree_percent / 100) - return (0); - - if (txg > spa->spa_lowmem_last_txg) { - spa->spa_lowmem_last_txg = txg; - spa->spa_lowmem_page_load = 0; - } - /* - * If we are in pageout, we know that memory is already tight, - * the arc is already going to be evicting, so we just want to - * continue to let page writes occur as quickly as possible. - */ - if (curproc == pageproc) { - if (spa->spa_lowmem_page_load > - MAX(ptob(minfree), available_memory) / 4) - return (SET_ERROR(ERESTART)); - /* Note: reserve is inflated, so we deflate */ - atomic_add_64(&spa->spa_lowmem_page_load, reserve / 8); - return (0); - } else if (spa->spa_lowmem_page_load > 0 && arc_reclaim_needed()) { - /* memory is low, delay before restarting */ - ARCSTAT_INCR(arcstat_memory_throttle_count, 1); - return (SET_ERROR(EAGAIN)); - } - spa->spa_lowmem_page_load = 0; -#endif /* _KERNEL */ - return (0); -} - -void -arc_tempreserve_clear(uint64_t reserve) -{ - atomic_add_64(&arc_tempreserve, -reserve); - ASSERT((int64_t)arc_tempreserve >= 0); -} - -int -arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg) -{ - int error; - uint64_t anon_size; - - if (reserve > arc_c/4 && !arc_no_grow) { - arc_c = MIN(arc_c_max, reserve * 4); - DTRACE_PROBE1(arc__set_reserve, uint64_t, arc_c); - } - if (reserve > arc_c) - return (SET_ERROR(ENOMEM)); - - /* - * Don't count loaned bufs as in flight dirty data to prevent long - * network delays from blocking transactions that are ready to be - * assigned to a txg. - */ - - /* assert that it has not wrapped around */ - ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0); - - anon_size = MAX((int64_t)(zfs_refcount_count(&arc_anon->arcs_size) - - arc_loaned_bytes), 0); - - /* - * Writes will, almost always, require additional memory allocations - * in order to compress/encrypt/etc the data. We therefore need to - * make sure that there is sufficient available memory for this. - */ - error = arc_memory_throttle(spa, reserve, txg); - if (error != 0) - return (error); - - /* - * Throttle writes when the amount of dirty data in the cache - * gets too large. We try to keep the cache less than half full - * of dirty blocks so that our sync times don't grow too large. - * - * In the case of one pool being built on another pool, we want - * to make sure we don't end up throttling the lower (backing) - * pool when the upper pool is the majority contributor to dirty - * data. To insure we make forward progress during throttling, we - * also check the current pool's net dirty data and only throttle - * if it exceeds zfs_arc_pool_dirty_percent of the anonymous dirty - * data in the cache. - * - * Note: if two requests come in concurrently, we might let them - * both succeed, when one of them should fail. Not a huge deal. 
- */ - uint64_t total_dirty = reserve + arc_tempreserve + anon_size; - uint64_t spa_dirty_anon = spa_dirty_data(spa); - - if (total_dirty > arc_c * zfs_arc_dirty_limit_percent / 100 && - anon_size > arc_c * zfs_arc_anon_limit_percent / 100 && - spa_dirty_anon > anon_size * zfs_arc_pool_dirty_percent / 100) { - uint64_t meta_esize = - zfs_refcount_count( - &arc_anon->arcs_esize[ARC_BUFC_METADATA]); - uint64_t data_esize = - zfs_refcount_count(&arc_anon->arcs_esize[ARC_BUFC_DATA]); - dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " - "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n", - arc_tempreserve >> 10, meta_esize >> 10, - data_esize >> 10, reserve >> 10, arc_c >> 10); - return (SET_ERROR(ERESTART)); - } - atomic_add_64(&arc_tempreserve, reserve); - return (0); -} - -static void -arc_kstat_update_state(arc_state_t *state, kstat_named_t *size, - kstat_named_t *evict_data, kstat_named_t *evict_metadata) -{ - size->value.ui64 = zfs_refcount_count(&state->arcs_size); - evict_data->value.ui64 = - zfs_refcount_count(&state->arcs_esize[ARC_BUFC_DATA]); - evict_metadata->value.ui64 = - zfs_refcount_count(&state->arcs_esize[ARC_BUFC_METADATA]); -} - -static int -arc_kstat_update(kstat_t *ksp, int rw) -{ - arc_stats_t *as = ksp->ks_data; - - if (rw == KSTAT_WRITE) { - return (EACCES); - } else { - arc_kstat_update_state(arc_anon, - &as->arcstat_anon_size, - &as->arcstat_anon_evictable_data, - &as->arcstat_anon_evictable_metadata); - arc_kstat_update_state(arc_mru, - &as->arcstat_mru_size, - &as->arcstat_mru_evictable_data, - &as->arcstat_mru_evictable_metadata); - arc_kstat_update_state(arc_mru_ghost, - &as->arcstat_mru_ghost_size, - &as->arcstat_mru_ghost_evictable_data, - &as->arcstat_mru_ghost_evictable_metadata); - arc_kstat_update_state(arc_mfu, - &as->arcstat_mfu_size, - &as->arcstat_mfu_evictable_data, - &as->arcstat_mfu_evictable_metadata); - arc_kstat_update_state(arc_mfu_ghost, - &as->arcstat_mfu_ghost_size, - &as->arcstat_mfu_ghost_evictable_data, - &as->arcstat_mfu_ghost_evictable_metadata); - - ARCSTAT(arcstat_size) = aggsum_value(&arc_size); - ARCSTAT(arcstat_meta_used) = aggsum_value(&arc_meta_used); - ARCSTAT(arcstat_data_size) = aggsum_value(&astat_data_size); - ARCSTAT(arcstat_metadata_size) = - aggsum_value(&astat_metadata_size); - ARCSTAT(arcstat_hdr_size) = aggsum_value(&astat_hdr_size); - ARCSTAT(arcstat_bonus_size) = aggsum_value(&astat_bonus_size); - ARCSTAT(arcstat_dnode_size) = aggsum_value(&astat_dnode_size); - ARCSTAT(arcstat_dbuf_size) = aggsum_value(&astat_dbuf_size); -#if defined(__FreeBSD__) && defined(COMPAT_FREEBSD11) - ARCSTAT(arcstat_other_size) = aggsum_value(&astat_bonus_size) + - aggsum_value(&astat_dnode_size) + - aggsum_value(&astat_dbuf_size); -#endif - ARCSTAT(arcstat_l2_hdr_size) = aggsum_value(&astat_l2_hdr_size); - } - - return (0); -} - -/* - * This function *must* return indices evenly distributed between all - * sublists of the multilist. This is needed due to how the ARC eviction - * code is laid out; arc_evict_state() assumes ARC buffers are evenly - * distributed between all sublists and uses this assumption when - * deciding which sublist to evict from and how much to evict from it. - */ -unsigned int -arc_state_multilist_index_func(multilist_t *ml, void *obj) -{ - arc_buf_hdr_t *hdr = obj; - - /* - * We rely on b_dva to generate evenly distributed index - * numbers using buf_hash below. So, as an added precaution, - * let's make sure we never add empty buffers to the arc lists. 
- */ - ASSERT(!HDR_EMPTY(hdr)); - - /* - * The assumption here, is the hash value for a given - * arc_buf_hdr_t will remain constant throughout it's lifetime - * (i.e. it's b_spa, b_dva, and b_birth fields don't change). - * Thus, we don't need to store the header's sublist index - * on insertion, as this index can be recalculated on removal. - * - * Also, the low order bits of the hash value are thought to be - * distributed evenly. Otherwise, in the case that the multilist - * has a power of two number of sublists, each sublists' usage - * would not be evenly distributed. - */ - return (buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) % - multilist_get_num_sublists(ml)); -} - -#ifdef _KERNEL -static eventhandler_tag arc_event_lowmem = NULL; - -static void -arc_lowmem(void *arg __unused, int howto __unused) -{ - int64_t free_memory, to_free; - - arc_no_grow = B_TRUE; - arc_warm = B_TRUE; - arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry); - free_memory = arc_available_memory(); - to_free = (arc_c >> arc_shrink_shift) - MIN(free_memory, 0); - DTRACE_PROBE2(arc__needfree, int64_t, free_memory, int64_t, to_free); - arc_reduce_target_size(to_free); - - mutex_enter(&arc_adjust_lock); - arc_adjust_needed = B_TRUE; - zthr_wakeup(arc_adjust_zthr); - - /* - * It is unsafe to block here in arbitrary threads, because we can come - * here from ARC itself and may hold ARC locks and thus risk a deadlock - * with ARC reclaim thread. - */ - if (curproc == pageproc) - (void) cv_wait(&arc_adjust_waiters_cv, &arc_adjust_lock); - mutex_exit(&arc_adjust_lock); -} -#endif - -static void -arc_state_init(void) -{ - arc_anon = &ARC_anon; - arc_mru = &ARC_mru; - arc_mru_ghost = &ARC_mru_ghost; - arc_mfu = &ARC_mfu; - arc_mfu_ghost = &ARC_mfu_ghost; - arc_l2c_only = &ARC_l2c_only; - - arc_mru->arcs_list[ARC_BUFC_METADATA] = - multilist_create(sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - arc_state_multilist_index_func); - arc_mru->arcs_list[ARC_BUFC_DATA] = - multilist_create(sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - arc_state_multilist_index_func); - arc_mru_ghost->arcs_list[ARC_BUFC_METADATA] = - multilist_create(sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - arc_state_multilist_index_func); - arc_mru_ghost->arcs_list[ARC_BUFC_DATA] = - multilist_create(sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - arc_state_multilist_index_func); - arc_mfu->arcs_list[ARC_BUFC_METADATA] = - multilist_create(sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - arc_state_multilist_index_func); - arc_mfu->arcs_list[ARC_BUFC_DATA] = - multilist_create(sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - arc_state_multilist_index_func); - arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA] = - multilist_create(sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - arc_state_multilist_index_func); - arc_mfu_ghost->arcs_list[ARC_BUFC_DATA] = - multilist_create(sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - arc_state_multilist_index_func); - arc_l2c_only->arcs_list[ARC_BUFC_METADATA] = - multilist_create(sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - arc_state_multilist_index_func); - arc_l2c_only->arcs_list[ARC_BUFC_DATA] = - multilist_create(sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - arc_state_multilist_index_func); - - zfs_refcount_create(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); - 
zfs_refcount_create(&arc_anon->arcs_esize[ARC_BUFC_DATA]); - zfs_refcount_create(&arc_mru->arcs_esize[ARC_BUFC_METADATA]); - zfs_refcount_create(&arc_mru->arcs_esize[ARC_BUFC_DATA]); - zfs_refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]); - zfs_refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]); - zfs_refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); - zfs_refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_DATA]); - zfs_refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]); - zfs_refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]); - zfs_refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]); - zfs_refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]); - - zfs_refcount_create(&arc_anon->arcs_size); - zfs_refcount_create(&arc_mru->arcs_size); - zfs_refcount_create(&arc_mru_ghost->arcs_size); - zfs_refcount_create(&arc_mfu->arcs_size); - zfs_refcount_create(&arc_mfu_ghost->arcs_size); - zfs_refcount_create(&arc_l2c_only->arcs_size); - - aggsum_init(&arc_meta_used, 0); - aggsum_init(&arc_size, 0); - aggsum_init(&astat_data_size, 0); - aggsum_init(&astat_metadata_size, 0); - aggsum_init(&astat_hdr_size, 0); - aggsum_init(&astat_bonus_size, 0); - aggsum_init(&astat_dnode_size, 0); - aggsum_init(&astat_dbuf_size, 0); - aggsum_init(&astat_l2_hdr_size, 0); -} - -static void -arc_state_fini(void) -{ - zfs_refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); - zfs_refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_DATA]); - zfs_refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_METADATA]); - zfs_refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_DATA]); - zfs_refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]); - zfs_refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]); - zfs_refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); - zfs_refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_DATA]); - zfs_refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]); - zfs_refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]); - zfs_refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]); - zfs_refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]); - - zfs_refcount_destroy(&arc_anon->arcs_size); - zfs_refcount_destroy(&arc_mru->arcs_size); - zfs_refcount_destroy(&arc_mru_ghost->arcs_size); - zfs_refcount_destroy(&arc_mfu->arcs_size); - zfs_refcount_destroy(&arc_mfu_ghost->arcs_size); - zfs_refcount_destroy(&arc_l2c_only->arcs_size); - - multilist_destroy(arc_mru->arcs_list[ARC_BUFC_METADATA]); - multilist_destroy(arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); - multilist_destroy(arc_mfu->arcs_list[ARC_BUFC_METADATA]); - multilist_destroy(arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); - multilist_destroy(arc_mru->arcs_list[ARC_BUFC_DATA]); - multilist_destroy(arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); - multilist_destroy(arc_mfu->arcs_list[ARC_BUFC_DATA]); - multilist_destroy(arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); - - aggsum_fini(&arc_meta_used); - aggsum_fini(&arc_size); - aggsum_fini(&astat_data_size); - aggsum_fini(&astat_metadata_size); - aggsum_fini(&astat_hdr_size); - aggsum_fini(&astat_bonus_size); - aggsum_fini(&astat_dnode_size); - aggsum_fini(&astat_dbuf_size); - aggsum_fini(&astat_l2_hdr_size); -} - -uint64_t -arc_max_bytes(void) -{ - return (arc_c_max); -} - -void -arc_init(void) -{ - int i, prefetch_tunable_set = 0; - - /* - * allmem is "all memory that we could possibly use". 
- */ -#ifdef illumos -#ifdef _KERNEL - uint64_t allmem = ptob(physmem - swapfs_minfree); -#else - uint64_t allmem = (physmem * PAGESIZE) / 2; -#endif -#else - uint64_t allmem = kmem_size(); -#endif - mutex_init(&arc_adjust_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&arc_adjust_waiters_cv, NULL, CV_DEFAULT, NULL); - - mutex_init(&arc_dnlc_evicts_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&arc_dnlc_evicts_cv, NULL, CV_DEFAULT, NULL); - - /* set min cache to 1/32 of all memory, or arc_abs_min, whichever is more */ - arc_c_min = MAX(allmem / 32, arc_abs_min); - /* set max to 5/8 of all memory, or all but 1GB, whichever is more */ - if (allmem >= 1 << 30) - arc_c_max = allmem - (1 << 30); - else - arc_c_max = arc_c_min; - arc_c_max = MAX(allmem * 5 / 8, arc_c_max); - - /* - * In userland, there's only the memory pressure that we artificially - * create (see arc_available_memory()). Don't let arc_c get too - * small, because it can cause transactions to be larger than - * arc_c, causing arc_tempreserve_space() to fail. - */ -#ifndef _KERNEL - arc_c_min = arc_c_max / 2; -#endif - -#ifdef _KERNEL - /* - * Allow the tunables to override our calculations if they are - * reasonable. - */ - if (zfs_arc_max > arc_abs_min && zfs_arc_max < allmem) { - arc_c_max = zfs_arc_max; - arc_c_min = MIN(arc_c_min, arc_c_max); - } - if (zfs_arc_min > arc_abs_min && zfs_arc_min <= arc_c_max) - arc_c_min = zfs_arc_min; -#endif - - arc_c = arc_c_max; - arc_p = (arc_c >> 1); - - /* limit meta-data to 1/4 of the arc capacity */ - arc_meta_limit = arc_c_max / 4; - -#ifdef _KERNEL - /* - * Metadata is stored in the kernel's heap. Don't let us - * use more than half the heap for the ARC. - */ -#ifdef __FreeBSD__ - arc_meta_limit = MIN(arc_meta_limit, uma_limit() / 2); - arc_dnode_limit = arc_meta_limit / 10; -#else - arc_meta_limit = MIN(arc_meta_limit, - vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 2); -#endif -#endif - - /* Allow the tunable to override if it is reasonable */ - if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max) - arc_meta_limit = zfs_arc_meta_limit; - - if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0) - arc_c_min = arc_meta_limit / 2; - - if (zfs_arc_meta_min > 0) { - arc_meta_min = zfs_arc_meta_min; - } else { - arc_meta_min = arc_c_min / 2; - } - - /* Valid range: <arc_meta_min> - <arc_c_max> */ - if ((zfs_arc_dnode_limit) && (zfs_arc_dnode_limit != arc_dnode_limit) && - (zfs_arc_dnode_limit >= zfs_arc_meta_min) && - (zfs_arc_dnode_limit <= arc_c_max)) - arc_dnode_limit = zfs_arc_dnode_limit; - - if (zfs_arc_grow_retry > 0) - arc_grow_retry = zfs_arc_grow_retry; - - if (zfs_arc_shrink_shift > 0) - arc_shrink_shift = zfs_arc_shrink_shift; - - if (zfs_arc_no_grow_shift > 0) - arc_no_grow_shift = zfs_arc_no_grow_shift; - /* - * Ensure that arc_no_grow_shift is less than arc_shrink_shift. - */ - if (arc_no_grow_shift >= arc_shrink_shift) - arc_no_grow_shift = arc_shrink_shift - 1; - - if (zfs_arc_p_min_shift > 0) - arc_p_min_shift = zfs_arc_p_min_shift; - - /* if kmem_flags are set, lets try to use less memory */ - if (kmem_debugging()) - arc_c = arc_c / 2; - if (arc_c < arc_c_min) - arc_c = arc_c_min; - - zfs_arc_min = arc_c_min; - zfs_arc_max = arc_c_max; - - arc_state_init(); - - /* - * The arc must be "uninitialized", so that hdr_recl() (which is - * registered by buf_init()) will not access arc_reap_zthr before - * it is created. 
- */ - ASSERT(!arc_initialized); - buf_init(); - - list_create(&arc_prune_list, sizeof (arc_prune_t), - offsetof(arc_prune_t, p_node)); - mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL); - - arc_prune_taskq = taskq_create("arc_prune", max_ncpus, minclsyspri, - max_ncpus, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC); - - arc_dnlc_evicts_thread_exit = FALSE; - - arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, - sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); - - if (arc_ksp != NULL) { - arc_ksp->ks_data = &arc_stats; - arc_ksp->ks_update = arc_kstat_update; - kstat_install(arc_ksp); - } - - arc_adjust_zthr = zthr_create_timer(arc_adjust_cb_check, - arc_adjust_cb, NULL, SEC2NSEC(1)); - arc_reap_zthr = zthr_create_timer(arc_reap_cb_check, - arc_reap_cb, NULL, SEC2NSEC(1)); - -#ifdef _KERNEL - arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL, - EVENTHANDLER_PRI_FIRST); -#endif - - (void) thread_create(NULL, 0, arc_dnlc_evicts_thread, NULL, 0, &p0, - TS_RUN, minclsyspri); - - arc_initialized = B_TRUE; - arc_warm = B_FALSE; - - /* - * Calculate maximum amount of dirty data per pool. - * - * If it has been set by /etc/system, take that. - * Otherwise, use a percentage of physical memory defined by - * zfs_dirty_data_max_percent (default 10%) with a cap at - * zfs_dirty_data_max_max (default 4GB). - */ - if (zfs_dirty_data_max == 0) { - zfs_dirty_data_max = ptob(physmem) * - zfs_dirty_data_max_percent / 100; - zfs_dirty_data_max = MIN(zfs_dirty_data_max, - zfs_dirty_data_max_max); - } - -#ifdef _KERNEL - if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable)) - prefetch_tunable_set = 1; - -#ifdef __i386__ - if (prefetch_tunable_set == 0) { - printf("ZFS NOTICE: Prefetch is disabled by default on i386 " - "-- to enable,\n"); - printf(" add \"vfs.zfs.prefetch_disable=0\" " - "to /boot/loader.conf.\n"); - zfs_prefetch_disable = 1; - } -#else - if ((((uint64_t)physmem * PAGESIZE) < (1ULL << 32)) && - prefetch_tunable_set == 0) { - printf("ZFS NOTICE: Prefetch is disabled by default if less " - "than 4GB of RAM is present;\n" - " to enable, add \"vfs.zfs.prefetch_disable=0\" " - "to /boot/loader.conf.\n"); - zfs_prefetch_disable = 1; - } -#endif - /* Warn about ZFS memory and address space requirements. */ - if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) { - printf("ZFS WARNING: Recommended minimum RAM size is 512MB; " - "expect unstable behavior.\n"); - } - if (allmem < 512 * (1 << 20)) { - printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; " - "expect unstable behavior.\n"); - printf(" Consider tuning vm.kmem_size and " - "vm.kmem_size_max\n"); - printf(" in /boot/loader.conf.\n"); - } -#endif -} - -void -arc_fini(void) -{ - arc_prune_t *p; - -#ifdef _KERNEL - if (arc_event_lowmem != NULL) - EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem); -#endif - - /* Use B_TRUE to ensure *all* buffers are evicted */ - arc_flush(NULL, B_TRUE); - - mutex_enter(&arc_dnlc_evicts_lock); - arc_dnlc_evicts_thread_exit = TRUE; - /* - * The user evicts thread will set arc_user_evicts_thread_exit - * to FALSE when it is finished exiting; we're waiting for that. 
- */ - while (arc_dnlc_evicts_thread_exit) { - cv_signal(&arc_dnlc_evicts_cv); - cv_wait(&arc_dnlc_evicts_cv, &arc_dnlc_evicts_lock); - } - mutex_exit(&arc_dnlc_evicts_lock); - - arc_initialized = B_FALSE; - - if (arc_ksp != NULL) { - kstat_delete(arc_ksp); - arc_ksp = NULL; - } - - taskq_wait(arc_prune_taskq); - taskq_destroy(arc_prune_taskq); - - mutex_enter(&arc_prune_mtx); - while ((p = list_head(&arc_prune_list)) != NULL) { - list_remove(&arc_prune_list, p); - zfs_refcount_remove(&p->p_refcnt, &arc_prune_list); - zfs_refcount_destroy(&p->p_refcnt); - kmem_free(p, sizeof (*p)); - } - mutex_exit(&arc_prune_mtx); - - list_destroy(&arc_prune_list); - mutex_destroy(&arc_prune_mtx); - - (void) zthr_cancel(arc_adjust_zthr); - zthr_destroy(arc_adjust_zthr); - - mutex_destroy(&arc_dnlc_evicts_lock); - cv_destroy(&arc_dnlc_evicts_cv); - - (void) zthr_cancel(arc_reap_zthr); - zthr_destroy(arc_reap_zthr); - - mutex_destroy(&arc_adjust_lock); - cv_destroy(&arc_adjust_waiters_cv); - - /* - * buf_fini() must proceed arc_state_fini() because buf_fin() may - * trigger the release of kmem magazines, which can callback to - * arc_space_return() which accesses aggsums freed in act_state_fini(). - */ - buf_fini(); - arc_state_fini(); - - ASSERT0(arc_loaned_bytes); -} - -/* - * Level 2 ARC - * - * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk. - * It uses dedicated storage devices to hold cached data, which are populated - * using large infrequent writes. The main role of this cache is to boost - * the performance of random read workloads. The intended L2ARC devices - * include short-stroked disks, solid state disks, and other media with - * substantially faster read latency than disk. - * - * +-----------------------+ - * | ARC | - * +-----------------------+ - * | ^ ^ - * | | | - * l2arc_feed_thread() arc_read() - * | | | - * | l2arc read | - * V | | - * +---------------+ | - * | L2ARC | | - * +---------------+ | - * | ^ | - * l2arc_write() | | - * | | | - * V | | - * +-------+ +-------+ - * | vdev | | vdev | - * | cache | | cache | - * +-------+ +-------+ - * +=========+ .-----. - * : L2ARC : |-_____-| - * : devices : | Disks | - * +=========+ `-_____-' - * - * Read requests are satisfied from the following sources, in order: - * - * 1) ARC - * 2) vdev cache of L2ARC devices - * 3) L2ARC devices - * 4) vdev cache of disks - * 5) disks - * - * Some L2ARC device types exhibit extremely slow write performance. - * To accommodate for this there are some significant differences between - * the L2ARC and traditional cache design: - * - * 1. There is no eviction path from the ARC to the L2ARC. Evictions from - * the ARC behave as usual, freeing buffers and placing headers on ghost - * lists. The ARC does not send buffers to the L2ARC during eviction as - * this would add inflated write latencies for all ARC memory pressure. - * - * 2. The L2ARC attempts to cache data from the ARC before it is evicted. - * It does this by periodically scanning buffers from the eviction-end of - * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are - * not already there. It scans until a headroom of buffers is satisfied, - * which itself is a buffer for ARC eviction. If a compressible buffer is - * found during scanning and selected for writing to an L2ARC device, we - * temporarily boost scanning headroom during the next scan cycle to make - * sure we adapt to compression effects (which might significantly reduce - * the data volume we write to L2ARC). 
The thread that does this is - * l2arc_feed_thread(), illustrated below; example sizes are included to - * provide a better sense of ratio than this diagram: - * - * head --> tail - * +---------------------+----------+ - * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC - * +---------------------+----------+ | o L2ARC eligible - * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer - * +---------------------+----------+ | - * 15.9 Gbytes ^ 32 Mbytes | - * headroom | - * l2arc_feed_thread() - * | - * l2arc write hand <--[oooo]--' - * | 8 Mbyte - * | write max - * V - * +==============================+ - * L2ARC dev |####|#|###|###| |####| ... | - * +==============================+ - * 32 Gbytes - * - * 3. If an ARC buffer is copied to the L2ARC but then hit instead of - * evicted, then the L2ARC has cached a buffer much sooner than it probably - * needed to, potentially wasting L2ARC device bandwidth and storage. It is - * safe to say that this is an uncommon case, since buffers at the end of - * the ARC lists have moved there due to inactivity. - * - * 4. If the ARC evicts faster than the L2ARC can maintain a headroom, - * then the L2ARC simply misses copying some buffers. This serves as a - * pressure valve to prevent heavy read workloads from both stalling the ARC - * with waits and clogging the L2ARC with writes. This also helps prevent - * the potential for the L2ARC to churn if it attempts to cache content too - * quickly, such as during backups of the entire pool. - * - * 5. After system boot and before the ARC has filled main memory, there are - * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru - * lists can remain mostly static. Instead of searching from tail of these - * lists as pictured, the l2arc_feed_thread() will search from the list heads - * for eligible buffers, greatly increasing its chance of finding them. - * - * The L2ARC device write speed is also boosted during this time so that - * the L2ARC warms up faster. Since there have been no ARC evictions yet, - * there are no L2ARC reads, and no fear of degrading read performance - * through increased writes. - * - * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that - * the vdev queue can aggregate them into larger and fewer writes. Each - * device is written to in a rotor fashion, sweeping writes through - * available space then repeating. - * - * 7. The L2ARC does not store dirty content. It never needs to flush - * write buffers back to disk based storage. - * - * 8. If an ARC buffer is written (and dirtied) which also exists in the - * L2ARC, the now stale L2ARC buffer is immediately dropped. - * - * The performance of the L2ARC can be tweaked by a number of tunables, which - * may be necessary for different workloads: - * - * l2arc_write_max max write bytes per interval - * l2arc_write_boost extra write bytes during device warmup - * l2arc_noprefetch skip caching prefetched buffers - * l2arc_headroom number of max device writes to precache - * l2arc_headroom_boost when we find compressed buffers during ARC - * scanning, we multiply headroom by this - * percentage factor for the next scan cycle, - * since more compressed buffers are likely to - * be present - * l2arc_feed_secs seconds between L2ARC writing - * - * Tunables may be removed or added as future performance improvements are - * integrated, and also may become zpool properties. 
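For orientation, here is a heavily reduced sketch of the feed loop that the comment above describes, meant only to show how these tunables and the helper functions defined below fit together. The wrapper name is illustrative, not part of the file; locking, the timed sleep, low-memory and read-only-pool checks, and shutdown handling are all omitted.

static void
l2arc_feed_loop_sketch(void)		/* illustrative name only */
{
	clock_t begin, next = ddi_get_lbolt();
	l2arc_dev_t *dev;
	uint64_t size, wrote;

	for (;;) {
		/* the real thread sleeps here until 'next' before feeding again */
		begin = ddi_get_lbolt();

		/* rotor to the next usable cache vdev; NULL if none are available */
		if ((dev = l2arc_dev_get_next()) == NULL)
			continue;

		/* l2arc_write_max, plus l2arc_write_boost while the ARC is still warming */
		size = l2arc_write_size();

		/* clear space ahead of the device write hand, then copy eligible buffers */
		l2arc_evict(dev, size, B_FALSE);
		wrote = l2arc_write_buffers(dev->l2ad_spa, dev, size);

		/* busy ARC lists reschedule after l2arc_feed_min_ms, idle ones after l2arc_feed_secs */
		next = l2arc_write_interval(begin, size, wrote);

		/* drop the SCL_L2ARC hold taken by l2arc_dev_get_next() */
		spa_config_exit(dev->l2ad_spa, SCL_L2ARC, dev);
	}
}
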
- * - * There are three key functions that control how the L2ARC warms up: - * - * l2arc_write_eligible() check if a buffer is eligible to cache - * l2arc_write_size() calculate how much to write - * l2arc_write_interval() calculate sleep delay between writes - * - * These three functions determine what to write, how much, and how quickly - * to send writes. - */ - -static boolean_t -l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr) -{ - /* - * A buffer is *not* eligible for the L2ARC if it: - * 1. belongs to a different spa. - * 2. is already cached on the L2ARC. - * 3. has an I/O in progress (it may be an incomplete read). - * 4. is flagged not eligible (zfs property). - */ - if (hdr->b_spa != spa_guid) { - ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch); - return (B_FALSE); - } - if (HDR_HAS_L2HDR(hdr)) { - ARCSTAT_BUMP(arcstat_l2_write_in_l2); - return (B_FALSE); - } - if (HDR_IO_IN_PROGRESS(hdr)) { - ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress); - return (B_FALSE); - } - if (!HDR_L2CACHE(hdr)) { - ARCSTAT_BUMP(arcstat_l2_write_not_cacheable); - return (B_FALSE); - } - - return (B_TRUE); -} - -static uint64_t -l2arc_write_size(void) -{ - uint64_t size; - - /* - * Make sure our globals have meaningful values in case the user - * altered them. - */ - size = l2arc_write_max; - if (size == 0) { - cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must " - "be greater than zero, resetting it to the default (%d)", - L2ARC_WRITE_SIZE); - size = l2arc_write_max = L2ARC_WRITE_SIZE; - } - - if (arc_warm == B_FALSE) - size += l2arc_write_boost; - - return (size); - -} - -static clock_t -l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote) -{ - clock_t interval, next, now; - - /* - * If the ARC lists are busy, increase our write rate; if the - * lists are stale, idle back. This is achieved by checking - * how much we previously wrote - if it was more than half of - * what we wanted, schedule the next write much sooner. - */ - if (l2arc_feed_again && wrote > (wanted / 2)) - interval = (hz * l2arc_feed_min_ms) / 1000; - else - interval = hz * l2arc_feed_secs; - - now = ddi_get_lbolt(); - next = MAX(now, MIN(now + interval, began + interval)); - - return (next); -} - -/* - * Cycle through L2ARC devices. This is how L2ARC load balances. - * If a device is returned, this also returns holding the spa config lock. - */ -static l2arc_dev_t * -l2arc_dev_get_next(void) -{ - l2arc_dev_t *first, *next = NULL; - - /* - * Lock out the removal of spas (spa_namespace_lock), then removal - * of cache devices (l2arc_dev_mtx). Once a device has been selected, - * both locks will be dropped and a spa config lock held instead. 
- */ - mutex_enter(&spa_namespace_lock); - mutex_enter(&l2arc_dev_mtx); - - /* if there are no vdevs, there is nothing to do */ - if (l2arc_ndev == 0) - goto out; - - first = NULL; - next = l2arc_dev_last; - do { - /* loop around the list looking for a non-faulted vdev */ - if (next == NULL) { - next = list_head(l2arc_dev_list); - } else { - next = list_next(l2arc_dev_list, next); - if (next == NULL) - next = list_head(l2arc_dev_list); - } - - /* if we have come back to the start, bail out */ - if (first == NULL) - first = next; - else if (next == first) - break; - - } while (vdev_is_dead(next->l2ad_vdev)); - - /* if we were unable to find any usable vdevs, return NULL */ - if (vdev_is_dead(next->l2ad_vdev)) - next = NULL; - - l2arc_dev_last = next; - -out: - mutex_exit(&l2arc_dev_mtx); - - /* - * Grab the config lock to prevent the 'next' device from being - * removed while we are writing to it. - */ - if (next != NULL) - spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER); - mutex_exit(&spa_namespace_lock); - - return (next); -} - -/* - * Free buffers that were tagged for destruction. - */ -static void -l2arc_do_free_on_write() -{ - list_t *buflist; - l2arc_data_free_t *df, *df_prev; - - mutex_enter(&l2arc_free_on_write_mtx); - buflist = l2arc_free_on_write; - - for (df = list_tail(buflist); df; df = df_prev) { - df_prev = list_prev(buflist, df); - ASSERT3P(df->l2df_abd, !=, NULL); - abd_free(df->l2df_abd); - list_remove(buflist, df); - kmem_free(df, sizeof (l2arc_data_free_t)); - } - - mutex_exit(&l2arc_free_on_write_mtx); -} - -/* - * A write to a cache device has completed. Update all headers to allow - * reads from these buffers to begin. - */ -static void -l2arc_write_done(zio_t *zio) -{ - l2arc_write_callback_t *cb; - l2arc_dev_t *dev; - list_t *buflist; - arc_buf_hdr_t *head, *hdr, *hdr_prev; - kmutex_t *hash_lock; - int64_t bytes_dropped = 0; - - cb = zio->io_private; - ASSERT3P(cb, !=, NULL); - dev = cb->l2wcb_dev; - ASSERT3P(dev, !=, NULL); - head = cb->l2wcb_head; - ASSERT3P(head, !=, NULL); - buflist = &dev->l2ad_buflist; - ASSERT3P(buflist, !=, NULL); - DTRACE_PROBE2(l2arc__iodone, zio_t *, zio, - l2arc_write_callback_t *, cb); - - if (zio->io_error != 0) - ARCSTAT_BUMP(arcstat_l2_writes_error); - - /* - * All writes completed, or an error was hit. - */ -top: - mutex_enter(&dev->l2ad_mtx); - for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) { - hdr_prev = list_prev(buflist, hdr); - - hash_lock = HDR_LOCK(hdr); - - /* - * We cannot use mutex_enter or else we can deadlock - * with l2arc_write_buffers (due to swapping the order - * the hash lock and l2ad_mtx are taken). - */ - if (!mutex_tryenter(hash_lock)) { - /* - * Missed the hash lock. We must retry so we - * don't leave the ARC_FLAG_L2_WRITING bit set. - */ - ARCSTAT_BUMP(arcstat_l2_writes_lock_retry); - - /* - * We don't want to rescan the headers we've - * already marked as having been written out, so - * we reinsert the head node so we can pick up - * where we left off. - */ - list_remove(buflist, head); - list_insert_after(buflist, hdr, head); - - mutex_exit(&dev->l2ad_mtx); - - /* - * We wait for the hash lock to become available - * to try and prevent busy waiting, and increase - * the chance we'll be able to acquire the lock - * the next time around. - */ - mutex_enter(hash_lock); - mutex_exit(hash_lock); - goto top; - } - - /* - * We could not have been moved into the arc_l2c_only - * state while in-flight due to our ARC_FLAG_L2_WRITING - * bit being set. Let's just ensure that's being enforced. 
- */ - ASSERT(HDR_HAS_L1HDR(hdr)); - - if (zio->io_error != 0) { - /* - * Error - drop L2ARC entry. - */ - list_remove(buflist, hdr); - l2arc_trim(hdr); - arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR); - - ARCSTAT_INCR(arcstat_l2_psize, -arc_hdr_size(hdr)); - ARCSTAT_INCR(arcstat_l2_lsize, -HDR_GET_LSIZE(hdr)); - - bytes_dropped += arc_hdr_size(hdr); - (void) zfs_refcount_remove_many(&dev->l2ad_alloc, - arc_hdr_size(hdr), hdr); - } - - /* - * Allow ARC to begin reads and ghost list evictions to - * this L2ARC entry. - */ - arc_hdr_clear_flags(hdr, ARC_FLAG_L2_WRITING); - - mutex_exit(hash_lock); - } - - atomic_inc_64(&l2arc_writes_done); - list_remove(buflist, head); - ASSERT(!HDR_HAS_L1HDR(head)); - kmem_cache_free(hdr_l2only_cache, head); - mutex_exit(&dev->l2ad_mtx); - - vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0); - - l2arc_do_free_on_write(); - - kmem_free(cb, sizeof (l2arc_write_callback_t)); -} - -/* - * A read to a cache device completed. Validate buffer contents before - * handing over to the regular ARC routines. - */ -static void -l2arc_read_done(zio_t *zio) -{ - l2arc_read_callback_t *cb; - arc_buf_hdr_t *hdr; - kmutex_t *hash_lock; - boolean_t valid_cksum; - - ASSERT3P(zio->io_vd, !=, NULL); - ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE); - - spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd); - - cb = zio->io_private; - ASSERT3P(cb, !=, NULL); - hdr = cb->l2rcb_hdr; - ASSERT3P(hdr, !=, NULL); - - hash_lock = HDR_LOCK(hdr); - mutex_enter(hash_lock); - ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); - - /* - * If the data was read into a temporary buffer, - * move it and free the buffer. - */ - if (cb->l2rcb_abd != NULL) { - ASSERT3U(arc_hdr_size(hdr), <, zio->io_size); - if (zio->io_error == 0) { - abd_copy(hdr->b_l1hdr.b_pabd, cb->l2rcb_abd, - arc_hdr_size(hdr)); - } - - /* - * The following must be done regardless of whether - * there was an error: - * - free the temporary buffer - * - point zio to the real ARC buffer - * - set zio size accordingly - * These are required because zio is either re-used for - * an I/O of the block in the case of the error - * or the zio is passed to arc_read_done() and it - * needs real data. - */ - abd_free(cb->l2rcb_abd); - zio->io_size = zio->io_orig_size = arc_hdr_size(hdr); - zio->io_abd = zio->io_orig_abd = hdr->b_l1hdr.b_pabd; - } - - ASSERT3P(zio->io_abd, !=, NULL); - - /* - * Check this survived the L2ARC journey. - */ - ASSERT3P(zio->io_abd, ==, hdr->b_l1hdr.b_pabd); - zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ - zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ - - valid_cksum = arc_cksum_is_equal(hdr, zio); - if (valid_cksum && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) { - mutex_exit(hash_lock); - zio->io_private = hdr; - arc_read_done(zio); - } else { - /* - * Buffer didn't survive caching. Increment stats and - * reissue to the original storage device. - */ - if (zio->io_error != 0) { - ARCSTAT_BUMP(arcstat_l2_io_error); - } else { - zio->io_error = SET_ERROR(EIO); - } - if (!valid_cksum) - ARCSTAT_BUMP(arcstat_l2_cksum_bad); - - /* - * If there's no waiter, issue an async i/o to the primary - * storage now. If there *is* a waiter, the caller must - * issue the i/o in a context where it's OK to block. 
- */ - if (zio->io_waiter == NULL) { - zio_t *pio = zio_unique_parent(zio); - - ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL); - - zio = zio_read(pio, zio->io_spa, zio->io_bp, - hdr->b_l1hdr.b_pabd, zio->io_size, arc_read_done, - hdr, zio->io_priority, cb->l2rcb_flags, - &cb->l2rcb_zb); - for (struct arc_callback *acb = hdr->b_l1hdr.b_acb; - acb != NULL; acb = acb->acb_next) - acb->acb_zio_head = zio; - mutex_exit(hash_lock); - zio_nowait(zio); - } else - mutex_exit(hash_lock); - } - - kmem_free(cb, sizeof (l2arc_read_callback_t)); -} - -/* - * This is the list priority from which the L2ARC will search for pages to - * cache. This is used within loops (0..3) to cycle through lists in the - * desired order. This order can have a significant effect on cache - * performance. - * - * Currently the metadata lists are hit first, MFU then MRU, followed by - * the data lists. This function returns a locked list, and also returns - * the lock pointer. - */ -static multilist_sublist_t * -l2arc_sublist_lock(int list_num) -{ - multilist_t *ml = NULL; - unsigned int idx; - - ASSERT(list_num >= 0 && list_num <= 3); - - switch (list_num) { - case 0: - ml = arc_mfu->arcs_list[ARC_BUFC_METADATA]; - break; - case 1: - ml = arc_mru->arcs_list[ARC_BUFC_METADATA]; - break; - case 2: - ml = arc_mfu->arcs_list[ARC_BUFC_DATA]; - break; - case 3: - ml = arc_mru->arcs_list[ARC_BUFC_DATA]; - break; - } - - /* - * Return a randomly-selected sublist. This is acceptable - * because the caller feeds only a little bit of data for each - * call (8MB). Subsequent calls will result in different - * sublists being selected. - */ - idx = multilist_get_random_index(ml); - return (multilist_sublist_lock(ml, idx)); -} - -/* - * Evict buffers from the device write hand to the distance specified in - * bytes. This distance may span populated buffers, it may span nothing. - * This is clearing a region on the L2ARC device ready for writing. - * If the 'all' boolean is set, every buffer is evicted. - */ -static void -l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) -{ - list_t *buflist; - arc_buf_hdr_t *hdr, *hdr_prev; - kmutex_t *hash_lock; - uint64_t taddr; - - buflist = &dev->l2ad_buflist; - - if (!all && dev->l2ad_first) { - /* - * This is the first sweep through the device. There is - * nothing to evict. - */ - return; - } - - if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) { - /* - * When nearing the end of the device, evict to the end - * before the device write hand jumps to the start. - */ - taddr = dev->l2ad_end; - } else { - taddr = dev->l2ad_hand + distance; - } - DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist, - uint64_t, taddr, boolean_t, all); - -top: - mutex_enter(&dev->l2ad_mtx); - for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) { - hdr_prev = list_prev(buflist, hdr); - - hash_lock = HDR_LOCK(hdr); - - /* - * We cannot use mutex_enter or else we can deadlock - * with l2arc_write_buffers (due to swapping the order - * the hash lock and l2ad_mtx are taken). - */ - if (!mutex_tryenter(hash_lock)) { - /* - * Missed the hash lock. Retry. - */ - ARCSTAT_BUMP(arcstat_l2_evict_lock_retry); - mutex_exit(&dev->l2ad_mtx); - mutex_enter(hash_lock); - mutex_exit(hash_lock); - goto top; - } - - /* - * A header can't be on this list if it doesn't have L2 header. - */ - ASSERT(HDR_HAS_L2HDR(hdr)); - - /* Ensure this header has finished being written. 
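The l2arc_sublist_lock() comment above fixes the feed priority as four passes: MFU metadata, then MRU metadata, then MFU data, then MRU data. A table-driven sketch of that ordering, detached from the multilist machinery (the enum and struct names here are illustrative only):

enum arc_state_kind { ARC_MFU, ARC_MRU };
enum arc_buf_kind { ARC_METADATA, ARC_DATA };

struct feed_pass {
	enum arc_state_kind	fp_state;
	enum arc_buf_kind	fp_type;
};

/*
 * Pass order used when scanning for L2ARC-eligible buffers: metadata
 * before data, and MFU before MRU within each, matching the switch in
 * the removed l2arc_sublist_lock().
 */
static const struct feed_pass l2arc_feed_order[4] = {
	{ ARC_MFU, ARC_METADATA },	/* pass 0 */
	{ ARC_MRU, ARC_METADATA },	/* pass 1 */
	{ ARC_MFU, ARC_DATA },		/* pass 2 */
	{ ARC_MRU, ARC_DATA },		/* pass 3 */
};

A feed cycle then walks l2arc_feed_order[0..3] in order, which is what the removed code expresses with its hard-coded switch on list_num.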
*/ - ASSERT(!HDR_L2_WRITING(hdr)); - ASSERT(!HDR_L2_WRITE_HEAD(hdr)); - - if (!all && (hdr->b_l2hdr.b_daddr >= taddr || - hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) { - /* - * We've evicted to the target address, - * or the end of the device. - */ - mutex_exit(hash_lock); - break; - } - - if (!HDR_HAS_L1HDR(hdr)) { - ASSERT(!HDR_L2_READING(hdr)); - /* - * This doesn't exist in the ARC. Destroy. - * arc_hdr_destroy() will call list_remove() - * and decrement arcstat_l2_lsize. - */ - arc_change_state(arc_anon, hdr, hash_lock); - arc_hdr_destroy(hdr); - } else { - ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only); - ARCSTAT_BUMP(arcstat_l2_evict_l1cached); - /* - * Invalidate issued or about to be issued - * reads, since we may be about to write - * over this location. - */ - if (HDR_L2_READING(hdr)) { - ARCSTAT_BUMP(arcstat_l2_evict_reading); - arc_hdr_set_flags(hdr, ARC_FLAG_L2_EVICTED); - } - - arc_hdr_l2hdr_destroy(hdr); - } - mutex_exit(hash_lock); - } - mutex_exit(&dev->l2ad_mtx); -} - -/* - * Find and write ARC buffers to the L2ARC device. - * - * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid - * for reading until they have completed writing. - * The headroom_boost is an in-out parameter used to maintain headroom boost - * state between calls to this function. - * - * Returns the number of bytes actually written (which may be smaller than - * the delta by which the device hand has changed due to alignment). - */ -static uint64_t -l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) -{ - arc_buf_hdr_t *hdr, *hdr_prev, *head; - uint64_t write_asize, write_psize, write_lsize, headroom; - boolean_t full; - l2arc_write_callback_t *cb; - zio_t *pio, *wzio; - uint64_t guid = spa_load_guid(spa); - int try; - - ASSERT3P(dev->l2ad_vdev, !=, NULL); - - pio = NULL; - write_lsize = write_asize = write_psize = 0; - full = B_FALSE; - head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE); - arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR); - - ARCSTAT_BUMP(arcstat_l2_write_buffer_iter); - /* - * Copy buffers for L2ARC writing. - */ - for (try = 0; try <= 3; try++) { - multilist_sublist_t *mls = l2arc_sublist_lock(try); - uint64_t passed_sz = 0; - - ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter); - - /* - * L2ARC fast warmup. - * - * Until the ARC is warm and starts to evict, read from the - * head of the ARC lists rather than the tail. - */ - if (arc_warm == B_FALSE) - hdr = multilist_sublist_head(mls); - else - hdr = multilist_sublist_tail(mls); - if (hdr == NULL) - ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter); - - headroom = target_sz * l2arc_headroom; - if (zfs_compressed_arc_enabled) - headroom = (headroom * l2arc_headroom_boost) / 100; - - for (; hdr; hdr = hdr_prev) { - kmutex_t *hash_lock; - - if (arc_warm == B_FALSE) - hdr_prev = multilist_sublist_next(mls, hdr); - else - hdr_prev = multilist_sublist_prev(mls, hdr); - ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned, - HDR_GET_LSIZE(hdr)); - - hash_lock = HDR_LOCK(hdr); - if (!mutex_tryenter(hash_lock)) { - ARCSTAT_BUMP(arcstat_l2_write_trylock_fail); - /* - * Skip this buffer rather than waiting. - */ - continue; - } - - passed_sz += HDR_GET_LSIZE(hdr); - if (passed_sz > headroom) { - /* - * Searched too far. 
- */ - mutex_exit(hash_lock); - ARCSTAT_BUMP(arcstat_l2_write_passed_headroom); - break; - } - - if (!l2arc_write_eligible(guid, hdr)) { - mutex_exit(hash_lock); - continue; - } - - /* - * We rely on the L1 portion of the header below, so - * it's invalid for this header to have been evicted out - * of the ghost cache, prior to being written out. The - * ARC_FLAG_L2_WRITING bit ensures this won't happen. - */ - ASSERT(HDR_HAS_L1HDR(hdr)); - - ASSERT3U(HDR_GET_PSIZE(hdr), >, 0); - ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); - ASSERT3U(arc_hdr_size(hdr), >, 0); - uint64_t psize = arc_hdr_size(hdr); - uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, - psize); - - if ((write_asize + asize) > target_sz) { - full = B_TRUE; - mutex_exit(hash_lock); - ARCSTAT_BUMP(arcstat_l2_write_full); - break; - } - - if (pio == NULL) { - /* - * Insert a dummy header on the buflist so - * l2arc_write_done() can find where the - * write buffers begin without searching. - */ - mutex_enter(&dev->l2ad_mtx); - list_insert_head(&dev->l2ad_buflist, head); - mutex_exit(&dev->l2ad_mtx); - - cb = kmem_alloc( - sizeof (l2arc_write_callback_t), KM_SLEEP); - cb->l2wcb_dev = dev; - cb->l2wcb_head = head; - pio = zio_root(spa, l2arc_write_done, cb, - ZIO_FLAG_CANFAIL); - ARCSTAT_BUMP(arcstat_l2_write_pios); - } - - hdr->b_l2hdr.b_dev = dev; - hdr->b_l2hdr.b_daddr = dev->l2ad_hand; - arc_hdr_set_flags(hdr, - ARC_FLAG_L2_WRITING | ARC_FLAG_HAS_L2HDR); - - mutex_enter(&dev->l2ad_mtx); - list_insert_head(&dev->l2ad_buflist, hdr); - mutex_exit(&dev->l2ad_mtx); - - (void) zfs_refcount_add_many(&dev->l2ad_alloc, psize, - hdr); - - /* - * Normally the L2ARC can use the hdr's data, but if - * we're sharing data between the hdr and one of its - * bufs, L2ARC needs its own copy of the data so that - * the ZIO below can't race with the buf consumer. - * Another case where we need to create a copy of the - * data is when the buffer size is not device-aligned - * and we need to pad the block to make it such. - * That also keeps the clock hand suitably aligned. - * - * To ensure that the copy will be available for the - * lifetime of the ZIO and be cleaned up afterwards, we - * add it to the l2arc_free_on_write queue. - */ - abd_t *to_write; - if (!HDR_SHARED_DATA(hdr) && psize == asize) { - to_write = hdr->b_l1hdr.b_pabd; - } else { - to_write = abd_alloc_for_io(asize, - HDR_ISTYPE_METADATA(hdr)); - abd_copy(to_write, hdr->b_l1hdr.b_pabd, psize); - if (asize != psize) { - abd_zero_off(to_write, psize, - asize - psize); - } - l2arc_free_abd_on_write(to_write, asize, - arc_buf_type(hdr)); - } - wzio = zio_write_phys(pio, dev->l2ad_vdev, - hdr->b_l2hdr.b_daddr, asize, to_write, - ZIO_CHECKSUM_OFF, NULL, hdr, - ZIO_PRIORITY_ASYNC_WRITE, - ZIO_FLAG_CANFAIL, B_FALSE); - - write_lsize += HDR_GET_LSIZE(hdr); - DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, - zio_t *, wzio); - - write_psize += psize; - write_asize += asize; - dev->l2ad_hand += asize; - - mutex_exit(hash_lock); - - (void) zio_nowait(wzio); - } - - multilist_sublist_unlock(mls); - - if (full == B_TRUE) - break; - } - - /* No buffers selected for writing? 
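One detail worth pulling out of the copy path above: when a buffer's physical size is not already device-aligned, the data is copied into a fresh buffer and zero-padded out to the allocated size so the clock hand stays aligned. A standalone sketch of that round-up and padding with plain malloc/memcpy instead of the ABD routines (ashift is assumed to be the device's power-of-two sector shift, typically 9 or 12):

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

/* Round psize up to the allocation size implied by the device's ashift. */
static uint64_t
psize_to_asize(uint64_t psize, unsigned ashift)
{
	uint64_t align = 1ULL << ashift;

	return ((psize + align - 1) & ~(align - 1));
}

/*
 * Return a buffer of asize bytes holding the psize bytes of data followed
 * by zero padding, analogous to the abd_copy()/abd_zero_off() pair in the
 * removed l2arc_write_buffers().  The caller frees the result.
 */
static void *
pad_for_device(const void *data, uint64_t psize, unsigned ashift,
    uint64_t *asizep)
{
	uint64_t asize = psize_to_asize(psize, ashift);
	void *buf = malloc(asize);

	if (buf == NULL)
		return (NULL);
	memcpy(buf, data, psize);
	memset((char *)buf + psize, 0, asize - psize);
	*asizep = asize;
	return (buf);
}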
*/ - if (pio == NULL) { - ASSERT0(write_lsize); - ASSERT(!HDR_HAS_L1HDR(head)); - kmem_cache_free(hdr_l2only_cache, head); - return (0); - } - - ASSERT3U(write_psize, <=, target_sz); - ARCSTAT_BUMP(arcstat_l2_writes_sent); - ARCSTAT_INCR(arcstat_l2_write_bytes, write_psize); - ARCSTAT_INCR(arcstat_l2_lsize, write_lsize); - ARCSTAT_INCR(arcstat_l2_psize, write_psize); - vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0); - - /* - * Bump device hand to the device start if it is approaching the end. - * l2arc_evict() will already have evicted ahead for this case. - */ - if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) { - dev->l2ad_hand = dev->l2ad_start; - dev->l2ad_first = B_FALSE; - } - - dev->l2ad_writing = B_TRUE; - (void) zio_wait(pio); - dev->l2ad_writing = B_FALSE; - - return (write_asize); -} - -/* - * This thread feeds the L2ARC at regular intervals. This is the beating - * heart of the L2ARC. - */ -/* ARGSUSED */ -static void -l2arc_feed_thread(void *unused __unused) -{ - callb_cpr_t cpr; - l2arc_dev_t *dev; - spa_t *spa; - uint64_t size, wrote; - clock_t begin, next = ddi_get_lbolt(); - - CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG); - - mutex_enter(&l2arc_feed_thr_lock); - - while (l2arc_thread_exit == 0) { - CALLB_CPR_SAFE_BEGIN(&cpr); - (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock, - next - ddi_get_lbolt()); - CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock); - next = ddi_get_lbolt() + hz; - - /* - * Quick check for L2ARC devices. - */ - mutex_enter(&l2arc_dev_mtx); - if (l2arc_ndev == 0) { - mutex_exit(&l2arc_dev_mtx); - continue; - } - mutex_exit(&l2arc_dev_mtx); - begin = ddi_get_lbolt(); - - /* - * This selects the next l2arc device to write to, and in - * doing so the next spa to feed from: dev->l2ad_spa. This - * will return NULL if there are now no l2arc devices or if - * they are all faulted. - * - * If a device is returned, its spa's config lock is also - * held to prevent device removal. l2arc_dev_get_next() - * will grab and release l2arc_dev_mtx. - */ - if ((dev = l2arc_dev_get_next()) == NULL) - continue; - - spa = dev->l2ad_spa; - ASSERT3P(spa, !=, NULL); - - /* - * If the pool is read-only then force the feed thread to - * sleep a little longer. - */ - if (!spa_writeable(spa)) { - next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz; - spa_config_exit(spa, SCL_L2ARC, dev); - continue; - } - - /* - * Avoid contributing to memory pressure. - */ - if (arc_reclaim_needed()) { - ARCSTAT_BUMP(arcstat_l2_abort_lowmem); - spa_config_exit(spa, SCL_L2ARC, dev); - continue; - } - - ARCSTAT_BUMP(arcstat_l2_feeds); - - size = l2arc_write_size(); - - /* - * Evict L2ARC buffers that will be overwritten. - */ - l2arc_evict(dev, size, B_FALSE); - - /* - * Write ARC buffers. - */ - wrote = l2arc_write_buffers(spa, dev, size); - - /* - * Calculate interval between writes. - */ - next = l2arc_write_interval(begin, size, wrote); - spa_config_exit(spa, SCL_L2ARC, dev); - } - - l2arc_thread_exit = 0; - cv_broadcast(&l2arc_feed_thr_cv); - CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */ - thread_exit(); -} - -boolean_t -l2arc_vdev_present(vdev_t *vd) -{ - l2arc_dev_t *dev; - - mutex_enter(&l2arc_dev_mtx); - for (dev = list_head(l2arc_dev_list); dev != NULL; - dev = list_next(l2arc_dev_list, dev)) { - if (dev->l2ad_vdev == vd) - break; - } - mutex_exit(&l2arc_dev_mtx); - - return (dev != NULL); -} - -/* - * Add a vdev for use by the L2ARC. By this point the spa has already - * validated the vdev and opened it. 
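The hand arithmetic above treats the cache device as a circular log: the write hand only advances, and once a full target-sized write would no longer fit before l2ad_end it jumps back to l2ad_start, relying on the eviction done ahead of the hand. A condensed sketch of that advance-and-wrap step (field names abbreviated, and the per-buffer advances collapsed into a single call):

#include <stdint.h>

struct l2dev_hand {
	uint64_t start;		/* first writable byte, past the labels */
	uint64_t end;		/* one past the last writable byte */
	uint64_t hand;		/* next write offset */
	int	 first_pass;	/* still on the first sweep of the device */
};

/*
 * Advance the write hand by "wrote" bytes, then wrap it back to the start
 * of the device if fewer than "target" bytes remain before the end, the
 * same post-write adjustment the removed code performs.
 */
static void
advance_hand(struct l2dev_hand *d, uint64_t wrote, uint64_t target)
{
	d->hand += wrote;
	if (d->hand >= d->end - target) {
		d->hand = d->start;
		d->first_pass = 0;	/* later sweeps must evict ahead */
	}
}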
- */ -void -l2arc_add_vdev(spa_t *spa, vdev_t *vd) -{ - l2arc_dev_t *adddev; - - ASSERT(!l2arc_vdev_present(vd)); - - vdev_ashift_optimize(vd); - - /* - * Create a new l2arc device entry. - */ - adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP); - adddev->l2ad_spa = spa; - adddev->l2ad_vdev = vd; - adddev->l2ad_start = VDEV_LABEL_START_SIZE; - adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd); - adddev->l2ad_hand = adddev->l2ad_start; - adddev->l2ad_first = B_TRUE; - adddev->l2ad_writing = B_FALSE; - - mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL); - /* - * This is a list of all ARC buffers that are still valid on the - * device. - */ - list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node)); - - vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand); - zfs_refcount_create(&adddev->l2ad_alloc); - - /* - * Add device to global list - */ - mutex_enter(&l2arc_dev_mtx); - list_insert_head(l2arc_dev_list, adddev); - atomic_inc_64(&l2arc_ndev); - mutex_exit(&l2arc_dev_mtx); -} - -/* - * Remove a vdev from the L2ARC. - */ -void -l2arc_remove_vdev(vdev_t *vd) -{ - l2arc_dev_t *dev, *nextdev, *remdev = NULL; - - /* - * Find the device by vdev - */ - mutex_enter(&l2arc_dev_mtx); - for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) { - nextdev = list_next(l2arc_dev_list, dev); - if (vd == dev->l2ad_vdev) { - remdev = dev; - break; - } - } - ASSERT3P(remdev, !=, NULL); - - /* - * Remove device from global list - */ - list_remove(l2arc_dev_list, remdev); - l2arc_dev_last = NULL; /* may have been invalidated */ - atomic_dec_64(&l2arc_ndev); - mutex_exit(&l2arc_dev_mtx); - - /* - * Clear all buflists and ARC references. L2ARC device flush. - */ - l2arc_evict(remdev, 0, B_TRUE); - list_destroy(&remdev->l2ad_buflist); - mutex_destroy(&remdev->l2ad_mtx); - zfs_refcount_destroy(&remdev->l2ad_alloc); - kmem_free(remdev, sizeof (l2arc_dev_t)); -} - -void -l2arc_init(void) -{ - l2arc_thread_exit = 0; - l2arc_ndev = 0; - l2arc_writes_sent = 0; - l2arc_writes_done = 0; - - mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL); - mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL); - - l2arc_dev_list = &L2ARC_dev_list; - l2arc_free_on_write = &L2ARC_free_on_write; - list_create(l2arc_dev_list, sizeof (l2arc_dev_t), - offsetof(l2arc_dev_t, l2ad_node)); - list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t), - offsetof(l2arc_data_free_t, l2df_list_node)); -} - -void -l2arc_fini(void) -{ - /* - * This is called from dmu_fini(), which is called from spa_fini(); - * Because of this, we can assume that all l2arc devices have - * already been removed when the pools themselves were removed. 
- */ - - l2arc_do_free_on_write(); - - mutex_destroy(&l2arc_feed_thr_lock); - cv_destroy(&l2arc_feed_thr_cv); - mutex_destroy(&l2arc_dev_mtx); - mutex_destroy(&l2arc_free_on_write_mtx); - - list_destroy(l2arc_dev_list); - list_destroy(l2arc_free_on_write); -} - -void -l2arc_start(void) -{ - if (!(spa_mode_global & FWRITE)) - return; - - (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0, - TS_RUN, minclsyspri); -} - -void -l2arc_stop(void) -{ - if (!(spa_mode_global & FWRITE)) - return; - - mutex_enter(&l2arc_feed_thr_lock); - cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */ - l2arc_thread_exit = 1; - while (l2arc_thread_exit != 0) - cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock); - mutex_exit(&l2arc_feed_thr_lock); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/blkptr.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/blkptr.c deleted file mode 100644 index d7a7fdb0e1b1..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/blkptr.c +++ /dev/null @@ -1,152 +0,0 @@ -/* - * CDDL HEADER START - * - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2013, 2016 by Delphix. All rights reserved. - */ - -#include <sys/zfs_context.h> -#include <sys/zio.h> -#include <sys/zio_compress.h> - -/* - * Embedded-data Block Pointers - * - * Normally, block pointers point (via their DVAs) to a block which holds data. - * If the data that we need to store is very small, this is an inefficient - * use of space, because a block must be at minimum 1 sector (typically 512 - * bytes or 4KB). Additionally, reading these small blocks tends to generate - * more random reads. - * - * Embedded-data Block Pointers allow small pieces of data (the "payload", - * up to 112 bytes) to be stored in the block pointer itself, instead of - * being pointed to. The "Pointer" part of this name is a bit of a - * misnomer, as nothing is pointed to. - * - * BP_EMBEDDED_TYPE_DATA block pointers allow highly-compressible data to - * be embedded in the block pointer. The logic for this is handled in - * the SPA, by the zio pipeline. Therefore most code outside the zio - * pipeline doesn't need special-cases to handle these block pointers. - * - * See spa.h for details on the exact layout of embedded block pointers. - */ - -void -encode_embedded_bp_compressed(blkptr_t *bp, void *data, - enum zio_compress comp, int uncompressed_size, int compressed_size) -{ - uint64_t *bp64 = (uint64_t *)bp; - uint64_t w = 0; - uint8_t *data8 = data; - - ASSERT3U(compressed_size, <=, BPE_PAYLOAD_SIZE); - ASSERT(uncompressed_size == compressed_size || - comp != ZIO_COMPRESS_OFF); - ASSERT3U(comp, >=, ZIO_COMPRESS_OFF); - ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS); - - bzero(bp, sizeof (*bp)); - BP_SET_EMBEDDED(bp, B_TRUE); - BP_SET_COMPRESS(bp, comp); - BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); - BPE_SET_LSIZE(bp, uncompressed_size); - BPE_SET_PSIZE(bp, compressed_size); - - /* - * Encode the byte array into the words of the block pointer. - * First byte goes into low bits of first word (little endian). 
- */ - for (int i = 0; i < compressed_size; i++) { - BF64_SET(w, (i % sizeof (w)) * NBBY, NBBY, data8[i]); - if (i % sizeof (w) == sizeof (w) - 1) { - /* we've reached the end of a word */ - ASSERT3P(bp64, <, bp + 1); - *bp64 = w; - bp64++; - if (!BPE_IS_PAYLOADWORD(bp, bp64)) - bp64++; - w = 0; - } - } - /* write last partial word */ - if (bp64 < (uint64_t *)(bp + 1)) - *bp64 = w; -} - -/* - * buf must be at least BPE_GET_PSIZE(bp) bytes long (which will never be - * more than BPE_PAYLOAD_SIZE bytes). - */ -void -decode_embedded_bp_compressed(const blkptr_t *bp, void *buf) -{ - int psize; - uint8_t *buf8 = buf; - uint64_t w = 0; - const uint64_t *bp64 = (const uint64_t *)bp; - - ASSERT(BP_IS_EMBEDDED(bp)); - - psize = BPE_GET_PSIZE(bp); - - /* - * Decode the words of the block pointer into the byte array. - * Low bits of first word are the first byte (little endian). - */ - for (int i = 0; i < psize; i++) { - if (i % sizeof (w) == 0) { - /* beginning of a word */ - ASSERT3P(bp64, <, bp + 1); - w = *bp64; - bp64++; - if (!BPE_IS_PAYLOADWORD(bp, bp64)) - bp64++; - } - buf8[i] = BF64_GET(w, (i % sizeof (w)) * NBBY, NBBY); - } -} - -/* - * Fill in the buffer with the (decompressed) payload of the embedded - * blkptr_t. Takes into account compression and byteorder (the payload is - * treated as a stream of bytes). - * Return 0 on success, or ENOSPC if it won't fit in the buffer. - */ -int -decode_embedded_bp(const blkptr_t *bp, void *buf, int buflen) -{ - int lsize, psize; - - ASSERT(BP_IS_EMBEDDED(bp)); - - lsize = BPE_GET_LSIZE(bp); - psize = BPE_GET_PSIZE(bp); - - if (lsize > buflen) - return (ENOSPC); - ASSERT3U(lsize, ==, buflen); - - if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) { - uint8_t dstbuf[BPE_PAYLOAD_SIZE]; - decode_embedded_bp_compressed(bp, dstbuf); - VERIFY0(zio_decompress_data_buf(BP_GET_COMPRESS(bp), - dstbuf, buf, psize, buflen)); - } else { - ASSERT3U(lsize, ==, psize); - decode_embedded_bp_compressed(bp, buf); - } - - return (0); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c deleted file mode 100644 index ee12db3a266d..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c +++ /dev/null @@ -1,77 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012 by Delphix. All rights reserved. 
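Stripped of the blkptr_t layout and the BPE_IS_PAYLOADWORD word skipping, the encode/decode pair above is ordinary little-endian byte packing into 64-bit words. The sketch below shows just that core transform over a plain contiguous word array; it is a simplification for illustration, not the on-disk format.

#include <stdint.h>

#define	NBBY	8	/* bits per byte */

/*
 * Pack "len" bytes into 64-bit words, first byte in the low bits of the
 * first word (little endian).  "words" must have room for (len + 7) / 8
 * entries.
 */
static void
pack_bytes(uint64_t *words, const uint8_t *data, int len)
{
	uint64_t w = 0;

	for (int i = 0; i < len; i++) {
		w |= (uint64_t)data[i] << ((i % 8) * NBBY);
		if (i % 8 == 7) {		/* word is full, flush it */
			*words++ = w;
			w = 0;
		}
	}
	if (len % 8 != 0)			/* flush the final partial word */
		*words = w;
}

/* Inverse of pack_bytes(): extract "len" bytes from the word array. */
static void
unpack_bytes(uint8_t *data, const uint64_t *words, int len)
{
	uint64_t w = 0;

	for (int i = 0; i < len; i++) {
		if (i % 8 == 0)			/* start of a new word */
			w = *words++;
		data[i] = (uint8_t)(w >> ((i % 8) * NBBY));
	}
}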
- */ - -#include <sys/bplist.h> -#include <sys/zfs_context.h> - - -void -bplist_create(bplist_t *bpl) -{ - mutex_init(&bpl->bpl_lock, NULL, MUTEX_DEFAULT, NULL); - list_create(&bpl->bpl_list, sizeof (bplist_entry_t), - offsetof(bplist_entry_t, bpe_node)); -} - -void -bplist_destroy(bplist_t *bpl) -{ - list_destroy(&bpl->bpl_list); - mutex_destroy(&bpl->bpl_lock); -} - -void -bplist_append(bplist_t *bpl, const blkptr_t *bp) -{ - bplist_entry_t *bpe = kmem_alloc(sizeof (*bpe), KM_SLEEP); - - mutex_enter(&bpl->bpl_lock); - bpe->bpe_blk = *bp; - list_insert_tail(&bpl->bpl_list, bpe); - mutex_exit(&bpl->bpl_lock); -} - -/* - * To aid debugging, we keep the most recently removed entry. This way if - * we are in the callback, we can easily locate the entry. - */ -static bplist_entry_t *bplist_iterate_last_removed; - -void -bplist_iterate(bplist_t *bpl, bplist_itor_t *func, void *arg, dmu_tx_t *tx) -{ - bplist_entry_t *bpe; - - mutex_enter(&bpl->bpl_lock); - while (bpe = list_head(&bpl->bpl_list)) { - bplist_iterate_last_removed = bpe; - list_remove(&bpl->bpl_list, bpe); - mutex_exit(&bpl->bpl_lock); - func(arg, &bpe->bpe_blk, tx); - kmem_free(bpe, sizeof (*bpe)); - mutex_enter(&bpl->bpl_lock); - } - mutex_exit(&bpl->bpl_lock); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c deleted file mode 100644 index bbdd765214fc..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c +++ /dev/null @@ -1,606 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2016 by Delphix. All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - * Copyright (c) 2017 Datto Inc. - */ - -#include <sys/bpobj.h> -#include <sys/zfs_context.h> -#include <sys/refcount.h> -#include <sys/dsl_pool.h> -#include <sys/zfeature.h> -#include <sys/zap.h> - -/* - * Return an empty bpobj, preferably the empty dummy one (dp_empty_bpobj). 
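bplist_iterate() above shows a common drain pattern: pop the head while holding the list lock, drop the lock for the potentially slow callback, then retake it before looking at the next entry, so concurrent bplist_append() callers are never blocked behind the callback. A pthread-based userland sketch of the same loop (the node type and callback signature are illustrative):

#include <pthread.h>
#include <stdlib.h>

struct node {
	struct node	*next;
	int		 value;
};

struct drain_list {
	pthread_mutex_t	lock;
	struct node	*head;
};

/*
 * Pop-and-call drain loop: the lock is held only while manipulating the
 * list, never across the callback, mirroring the removed bplist_iterate().
 */
static void
drain(struct drain_list *l, void (*func)(int value, void *arg), void *arg)
{
	pthread_mutex_lock(&l->lock);
	while (l->head != NULL) {
		struct node *n = l->head;

		l->head = n->next;
		pthread_mutex_unlock(&l->lock);

		func(n->value, arg);	/* may block or take other locks */
		free(n);

		pthread_mutex_lock(&l->lock);
	}
	pthread_mutex_unlock(&l->lock);
}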
- */ -uint64_t -bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx) -{ - spa_t *spa = dmu_objset_spa(os); - dsl_pool_t *dp = dmu_objset_pool(os); - - if (spa_feature_is_enabled(spa, SPA_FEATURE_EMPTY_BPOBJ)) { - if (!spa_feature_is_active(spa, SPA_FEATURE_EMPTY_BPOBJ)) { - ASSERT0(dp->dp_empty_bpobj); - dp->dp_empty_bpobj = - bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx); - VERIFY(zap_add(os, - DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1, - &dp->dp_empty_bpobj, tx) == 0); - } - spa_feature_incr(spa, SPA_FEATURE_EMPTY_BPOBJ, tx); - ASSERT(dp->dp_empty_bpobj != 0); - return (dp->dp_empty_bpobj); - } else { - return (bpobj_alloc(os, blocksize, tx)); - } -} - -void -bpobj_decr_empty(objset_t *os, dmu_tx_t *tx) -{ - dsl_pool_t *dp = dmu_objset_pool(os); - - spa_feature_decr(dmu_objset_spa(os), SPA_FEATURE_EMPTY_BPOBJ, tx); - if (!spa_feature_is_active(dmu_objset_spa(os), - SPA_FEATURE_EMPTY_BPOBJ)) { - VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_EMPTY_BPOBJ, tx)); - VERIFY3U(0, ==, dmu_object_free(os, dp->dp_empty_bpobj, tx)); - dp->dp_empty_bpobj = 0; - } -} - -uint64_t -bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx) -{ - int size; - - if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_BPOBJ_ACCOUNT) - size = BPOBJ_SIZE_V0; - else if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS) - size = BPOBJ_SIZE_V1; - else - size = sizeof (bpobj_phys_t); - - return (dmu_object_alloc(os, DMU_OT_BPOBJ, blocksize, - DMU_OT_BPOBJ_HDR, size, tx)); -} - -void -bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx) -{ - int64_t i; - bpobj_t bpo; - dmu_object_info_t doi; - int epb; - dmu_buf_t *dbuf = NULL; - - ASSERT(obj != dmu_objset_pool(os)->dp_empty_bpobj); - VERIFY3U(0, ==, bpobj_open(&bpo, os, obj)); - - mutex_enter(&bpo.bpo_lock); - - if (!bpo.bpo_havesubobj || bpo.bpo_phys->bpo_subobjs == 0) - goto out; - - VERIFY3U(0, ==, dmu_object_info(os, bpo.bpo_phys->bpo_subobjs, &doi)); - epb = doi.doi_data_block_size / sizeof (uint64_t); - - for (i = bpo.bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) { - uint64_t *objarray; - uint64_t offset, blkoff; - - offset = i * sizeof (uint64_t); - blkoff = P2PHASE(i, epb); - - if (dbuf == NULL || dbuf->db_offset > offset) { - if (dbuf) - dmu_buf_rele(dbuf, FTAG); - VERIFY3U(0, ==, dmu_buf_hold(os, - bpo.bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0)); - } - - ASSERT3U(offset, >=, dbuf->db_offset); - ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size); - - objarray = dbuf->db_data; - bpobj_free(os, objarray[blkoff], tx); - } - if (dbuf) { - dmu_buf_rele(dbuf, FTAG); - dbuf = NULL; - } - VERIFY3U(0, ==, dmu_object_free(os, bpo.bpo_phys->bpo_subobjs, tx)); - -out: - mutex_exit(&bpo.bpo_lock); - bpobj_close(&bpo); - - VERIFY3U(0, ==, dmu_object_free(os, obj, tx)); -} - -int -bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object) -{ - dmu_object_info_t doi; - int err; - - err = dmu_object_info(os, object, &doi); - if (err) - return (err); - - bzero(bpo, sizeof (*bpo)); - mutex_init(&bpo->bpo_lock, NULL, MUTEX_DEFAULT, NULL); - - ASSERT(bpo->bpo_dbuf == NULL); - ASSERT(bpo->bpo_phys == NULL); - ASSERT(object != 0); - ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ); - ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPOBJ_HDR); - - err = dmu_bonus_hold(os, object, bpo, &bpo->bpo_dbuf); - if (err) - return (err); - - bpo->bpo_os = os; - bpo->bpo_object = object; - bpo->bpo_epb = doi.doi_data_block_size >> SPA_BLKPTRSHIFT; - bpo->bpo_havecomp = (doi.doi_bonus_size > BPOBJ_SIZE_V0); - bpo->bpo_havesubobj = 
(doi.doi_bonus_size > BPOBJ_SIZE_V1); - bpo->bpo_phys = bpo->bpo_dbuf->db_data; - return (0); -} - -boolean_t -bpobj_is_open(const bpobj_t *bpo) -{ - return (bpo->bpo_object != 0); -} - -void -bpobj_close(bpobj_t *bpo) -{ - /* Lame workaround for closing a bpobj that was never opened. */ - if (bpo->bpo_object == 0) - return; - - dmu_buf_rele(bpo->bpo_dbuf, bpo); - if (bpo->bpo_cached_dbuf != NULL) - dmu_buf_rele(bpo->bpo_cached_dbuf, bpo); - bpo->bpo_dbuf = NULL; - bpo->bpo_phys = NULL; - bpo->bpo_cached_dbuf = NULL; - bpo->bpo_object = 0; - - mutex_destroy(&bpo->bpo_lock); -} - -boolean_t -bpobj_is_empty(bpobj_t *bpo) -{ - return (bpo->bpo_phys->bpo_num_blkptrs == 0 && - (!bpo->bpo_havesubobj || bpo->bpo_phys->bpo_num_subobjs == 0)); -} - -static int -bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx, - boolean_t free) -{ - dmu_object_info_t doi; - int epb; - int64_t i; - int err = 0; - dmu_buf_t *dbuf = NULL; - - ASSERT(bpobj_is_open(bpo)); - mutex_enter(&bpo->bpo_lock); - - if (free) - dmu_buf_will_dirty(bpo->bpo_dbuf, tx); - - for (i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= 0; i--) { - blkptr_t *bparray; - blkptr_t *bp; - uint64_t offset, blkoff; - - offset = i * sizeof (blkptr_t); - blkoff = P2PHASE(i, bpo->bpo_epb); - - if (dbuf == NULL || dbuf->db_offset > offset) { - if (dbuf) - dmu_buf_rele(dbuf, FTAG); - err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, offset, - FTAG, &dbuf, 0); - if (err) - break; - } - - ASSERT3U(offset, >=, dbuf->db_offset); - ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size); - - bparray = dbuf->db_data; - bp = &bparray[blkoff]; - err = func(arg, bp, tx); - if (err) - break; - if (free) { - bpo->bpo_phys->bpo_bytes -= - bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp); - ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0); - if (bpo->bpo_havecomp) { - bpo->bpo_phys->bpo_comp -= BP_GET_PSIZE(bp); - bpo->bpo_phys->bpo_uncomp -= BP_GET_UCSIZE(bp); - } - bpo->bpo_phys->bpo_num_blkptrs--; - ASSERT3S(bpo->bpo_phys->bpo_num_blkptrs, >=, 0); - } - } - if (dbuf) { - dmu_buf_rele(dbuf, FTAG); - dbuf = NULL; - } - if (free) { - VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os, bpo->bpo_object, - (i + 1) * sizeof (blkptr_t), -1ULL, tx)); - } - if (err || !bpo->bpo_havesubobj || bpo->bpo_phys->bpo_subobjs == 0) - goto out; - - ASSERT(bpo->bpo_havecomp); - err = dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi); - if (err) { - mutex_exit(&bpo->bpo_lock); - return (err); - } - ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ); - epb = doi.doi_data_block_size / sizeof (uint64_t); - - for (i = bpo->bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) { - uint64_t *objarray; - uint64_t offset, blkoff; - bpobj_t sublist; - uint64_t used_before, comp_before, uncomp_before; - uint64_t used_after, comp_after, uncomp_after; - - offset = i * sizeof (uint64_t); - blkoff = P2PHASE(i, epb); - - if (dbuf == NULL || dbuf->db_offset > offset) { - if (dbuf) - dmu_buf_rele(dbuf, FTAG); - err = dmu_buf_hold(bpo->bpo_os, - bpo->bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0); - if (err) - break; - } - - ASSERT3U(offset, >=, dbuf->db_offset); - ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size); - - objarray = dbuf->db_data; - err = bpobj_open(&sublist, bpo->bpo_os, objarray[blkoff]); - if (err) - break; - if (free) { - err = bpobj_space(&sublist, - &used_before, &comp_before, &uncomp_before); - if (err != 0) { - bpobj_close(&sublist); - break; - } - } - err = bpobj_iterate_impl(&sublist, func, arg, tx, free); - if (free) { - VERIFY3U(0, ==, bpobj_space(&sublist, - 
&used_after, &comp_after, &uncomp_after)); - bpo->bpo_phys->bpo_bytes -= used_before - used_after; - ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0); - bpo->bpo_phys->bpo_comp -= comp_before - comp_after; - bpo->bpo_phys->bpo_uncomp -= - uncomp_before - uncomp_after; - } - - bpobj_close(&sublist); - if (err) - break; - if (free) { - err = dmu_object_free(bpo->bpo_os, - objarray[blkoff], tx); - if (err) - break; - bpo->bpo_phys->bpo_num_subobjs--; - ASSERT3S(bpo->bpo_phys->bpo_num_subobjs, >=, 0); - } - } - if (dbuf) { - dmu_buf_rele(dbuf, FTAG); - dbuf = NULL; - } - if (free) { - VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os, - bpo->bpo_phys->bpo_subobjs, - (i + 1) * sizeof (uint64_t), -1ULL, tx)); - } - -out: - /* If there are no entries, there should be no bytes. */ - if (bpobj_is_empty(bpo)) { - ASSERT0(bpo->bpo_phys->bpo_bytes); - ASSERT0(bpo->bpo_phys->bpo_comp); - ASSERT0(bpo->bpo_phys->bpo_uncomp); - } - - mutex_exit(&bpo->bpo_lock); - return (err); -} - -/* - * Iterate and remove the entries. If func returns nonzero, iteration - * will stop and that entry will not be removed. - */ -int -bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx) -{ - return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE)); -} - -/* - * Iterate the entries. If func returns nonzero, iteration will stop. - */ -int -bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx) -{ - return (bpobj_iterate_impl(bpo, func, arg, tx, B_FALSE)); -} - -void -bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx) -{ - bpobj_t subbpo; - uint64_t used, comp, uncomp, subsubobjs; - - ASSERT(bpobj_is_open(bpo)); - ASSERT(subobj != 0); - ASSERT(bpo->bpo_havesubobj); - ASSERT(bpo->bpo_havecomp); - ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj); - - if (subobj == dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj) { - bpobj_decr_empty(bpo->bpo_os, tx); - return; - } - - VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj)); - VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp)); - - if (bpobj_is_empty(&subbpo)) { - /* No point in having an empty subobj. */ - bpobj_close(&subbpo); - bpobj_free(bpo->bpo_os, subobj, tx); - return; - } - - mutex_enter(&bpo->bpo_lock); - dmu_buf_will_dirty(bpo->bpo_dbuf, tx); - if (bpo->bpo_phys->bpo_subobjs == 0) { - bpo->bpo_phys->bpo_subobjs = dmu_object_alloc(bpo->bpo_os, - DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE, - DMU_OT_NONE, 0, tx); - } - - dmu_object_info_t doi; - ASSERT0(dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi)); - ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ); - - dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, - bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), - sizeof (subobj), &subobj, tx); - bpo->bpo_phys->bpo_num_subobjs++; - - /* - * If subobj has only one block of subobjs, then move subobj's - * subobjs to bpo's subobj list directly. This reduces - * recursion in bpobj_iterate due to nested subobjs. - */ - subsubobjs = subbpo.bpo_phys->bpo_subobjs; - if (subsubobjs != 0) { - dmu_object_info_t doi; - - VERIFY3U(0, ==, dmu_object_info(bpo->bpo_os, subsubobjs, &doi)); - if (doi.doi_max_offset == doi.doi_data_block_size) { - dmu_buf_t *subdb; - uint64_t numsubsub = subbpo.bpo_phys->bpo_num_subobjs; - - VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, subsubobjs, - 0, FTAG, &subdb, 0)); - /* - * Make sure that we are not asking dmu_write() - * to write more data than we have in our buffer. 
- */ - VERIFY3U(subdb->db_size, >=, - numsubsub * sizeof (subobj)); - dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, - bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), - numsubsub * sizeof (subobj), subdb->db_data, tx); - dmu_buf_rele(subdb, FTAG); - bpo->bpo_phys->bpo_num_subobjs += numsubsub; - - dmu_buf_will_dirty(subbpo.bpo_dbuf, tx); - subbpo.bpo_phys->bpo_subobjs = 0; - VERIFY3U(0, ==, dmu_object_free(bpo->bpo_os, - subsubobjs, tx)); - } - } - bpo->bpo_phys->bpo_bytes += used; - bpo->bpo_phys->bpo_comp += comp; - bpo->bpo_phys->bpo_uncomp += uncomp; - mutex_exit(&bpo->bpo_lock); - - bpobj_close(&subbpo); -} - -void -bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx) -{ - blkptr_t stored_bp = *bp; - uint64_t offset; - int blkoff; - blkptr_t *bparray; - - ASSERT(bpobj_is_open(bpo)); - ASSERT(!BP_IS_HOLE(bp)); - ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj); - - if (BP_IS_EMBEDDED(bp)) { - /* - * The bpobj will compress better without the payload. - * - * Note that we store EMBEDDED bp's because they have an - * uncompressed size, which must be accounted for. An - * alternative would be to add their size to bpo_uncomp - * without storing the bp, but that would create additional - * complications: bpo_uncomp would be inconsistent with the - * set of BP's stored, and bpobj_iterate() wouldn't visit - * all the space accounted for in the bpobj. - */ - bzero(&stored_bp, sizeof (stored_bp)); - stored_bp.blk_prop = bp->blk_prop; - stored_bp.blk_birth = bp->blk_birth; - } else if (!BP_GET_DEDUP(bp)) { - /* The bpobj will compress better without the checksum */ - bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum)); - } - - /* We never need the fill count. */ - stored_bp.blk_fill = 0; - - mutex_enter(&bpo->bpo_lock); - - offset = bpo->bpo_phys->bpo_num_blkptrs * sizeof (stored_bp); - blkoff = P2PHASE(bpo->bpo_phys->bpo_num_blkptrs, bpo->bpo_epb); - - if (bpo->bpo_cached_dbuf == NULL || - offset < bpo->bpo_cached_dbuf->db_offset || - offset >= bpo->bpo_cached_dbuf->db_offset + - bpo->bpo_cached_dbuf->db_size) { - if (bpo->bpo_cached_dbuf) - dmu_buf_rele(bpo->bpo_cached_dbuf, bpo); - VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, - offset, bpo, &bpo->bpo_cached_dbuf, 0)); - } - - dmu_buf_will_dirty(bpo->bpo_cached_dbuf, tx); - bparray = bpo->bpo_cached_dbuf->db_data; - bparray[blkoff] = stored_bp; - - dmu_buf_will_dirty(bpo->bpo_dbuf, tx); - bpo->bpo_phys->bpo_num_blkptrs++; - bpo->bpo_phys->bpo_bytes += - bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp); - if (bpo->bpo_havecomp) { - bpo->bpo_phys->bpo_comp += BP_GET_PSIZE(bp); - bpo->bpo_phys->bpo_uncomp += BP_GET_UCSIZE(bp); - } - mutex_exit(&bpo->bpo_lock); -} - -struct space_range_arg { - spa_t *spa; - uint64_t mintxg; - uint64_t maxtxg; - uint64_t used; - uint64_t comp; - uint64_t uncomp; -}; - -/* ARGSUSED */ -static int -space_range_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) -{ - struct space_range_arg *sra = arg; - - if (bp->blk_birth > sra->mintxg && bp->blk_birth <= sra->maxtxg) { - if (dsl_pool_sync_context(spa_get_dsl(sra->spa))) - sra->used += bp_get_dsize_sync(sra->spa, bp); - else - sra->used += bp_get_dsize(sra->spa, bp); - sra->comp += BP_GET_PSIZE(bp); - sra->uncomp += BP_GET_UCSIZE(bp); - } - return (0); -} - -int -bpobj_space(bpobj_t *bpo, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) -{ - ASSERT(bpobj_is_open(bpo)); - mutex_enter(&bpo->bpo_lock); - - *usedp = bpo->bpo_phys->bpo_bytes; - if (bpo->bpo_havecomp) { - *compp = bpo->bpo_phys->bpo_comp; - 
*uncompp = bpo->bpo_phys->bpo_uncomp; - mutex_exit(&bpo->bpo_lock); - return (0); - } else { - mutex_exit(&bpo->bpo_lock); - return (bpobj_space_range(bpo, 0, UINT64_MAX, - usedp, compp, uncompp)); - } -} - -/* - * Return the amount of space in the bpobj which is: - * mintxg < blk_birth <= maxtxg - */ -int -bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg, - uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) -{ - struct space_range_arg sra = { 0 }; - int err; - - ASSERT(bpobj_is_open(bpo)); - - /* - * As an optimization, if they want the whole txg range, just - * get bpo_bytes rather than iterating over the bps. - */ - if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX && bpo->bpo_havecomp) - return (bpobj_space(bpo, usedp, compp, uncompp)); - - sra.spa = dmu_objset_spa(bpo->bpo_os); - sra.mintxg = mintxg; - sra.maxtxg = maxtxg; - - err = bpobj_iterate_nofree(bpo, space_range_cb, &sra, NULL); - *usedp = sra.used; - *compp = sra.comp; - *uncompp = sra.uncomp; - return (err); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c deleted file mode 100644 index c74d07236c1b..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c +++ /dev/null @@ -1,301 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2011, 2015 by Delphix. All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - */ - -#include <sys/arc.h> -#include <sys/bptree.h> -#include <sys/dmu.h> -#include <sys/dmu_objset.h> -#include <sys/dmu_tx.h> -#include <sys/dmu_traverse.h> -#include <sys/dsl_dataset.h> -#include <sys/dsl_dir.h> -#include <sys/dsl_pool.h> -#include <sys/dnode.h> -#include <sys/refcount.h> -#include <sys/spa.h> - -/* - * A bptree is a queue of root block pointers from destroyed datasets. When a - * dataset is destroyed its root block pointer is put on the end of the pool's - * bptree queue so the dataset's blocks can be freed asynchronously by - * dsl_scan_sync. This allows the delete operation to finish without traversing - * all the dataset's blocks. - * - * Note that while bt_begin and bt_end are only ever incremented in this code, - * they are effectively reset to 0 every time the entire bptree is freed because - * the bptree's object is destroyed and re-created. 
- */ - -struct bptree_args { - bptree_phys_t *ba_phys; /* data in bonus buffer, dirtied if freeing */ - boolean_t ba_free; /* true if freeing during traversal */ - - bptree_itor_t *ba_func; /* function to call for each blockpointer */ - void *ba_arg; /* caller supplied argument to ba_func */ - dmu_tx_t *ba_tx; /* caller supplied tx, NULL if not freeing */ -} bptree_args_t; - -uint64_t -bptree_alloc(objset_t *os, dmu_tx_t *tx) -{ - uint64_t obj; - dmu_buf_t *db; - bptree_phys_t *bt; - - obj = dmu_object_alloc(os, DMU_OTN_UINT64_METADATA, - SPA_OLD_MAXBLOCKSIZE, DMU_OTN_UINT64_METADATA, - sizeof (bptree_phys_t), tx); - - /* - * Bonus buffer contents are already initialized to 0, but for - * readability we make it explicit. - */ - VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); - dmu_buf_will_dirty(db, tx); - bt = db->db_data; - bt->bt_begin = 0; - bt->bt_end = 0; - bt->bt_bytes = 0; - bt->bt_comp = 0; - bt->bt_uncomp = 0; - dmu_buf_rele(db, FTAG); - - return (obj); -} - -int -bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx) -{ - dmu_buf_t *db; - bptree_phys_t *bt; - - VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); - bt = db->db_data; - ASSERT3U(bt->bt_begin, ==, bt->bt_end); - ASSERT0(bt->bt_bytes); - ASSERT0(bt->bt_comp); - ASSERT0(bt->bt_uncomp); - dmu_buf_rele(db, FTAG); - - return (dmu_object_free(os, obj, tx)); -} - -boolean_t -bptree_is_empty(objset_t *os, uint64_t obj) -{ - dmu_buf_t *db; - bptree_phys_t *bt; - boolean_t rv; - - VERIFY0(dmu_bonus_hold(os, obj, FTAG, &db)); - bt = db->db_data; - rv = (bt->bt_begin == bt->bt_end); - dmu_buf_rele(db, FTAG); - return (rv); -} - -void -bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg, - uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx) -{ - dmu_buf_t *db; - bptree_phys_t *bt; - bptree_entry_phys_t bte = { 0 }; - - /* - * bptree objects are in the pool mos, therefore they can only be - * modified in syncing context. Furthermore, this is only modified - * by the sync thread, so no locking is necessary. - */ - ASSERT(dmu_tx_is_syncing(tx)); - - VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); - bt = db->db_data; - - bte.be_birth_txg = birth_txg; - bte.be_bp = *bp; - dmu_write(os, obj, bt->bt_end * sizeof (bte), sizeof (bte), &bte, tx); - - dmu_buf_will_dirty(db, tx); - bt->bt_end++; - bt->bt_bytes += bytes; - bt->bt_comp += comp; - bt->bt_uncomp += uncomp; - dmu_buf_rele(db, FTAG); -} - -/* ARGSUSED */ -static int -bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) -{ - int err; - struct bptree_args *ba = arg; - - if (bp == NULL || BP_IS_HOLE(bp)) - return (0); - - err = ba->ba_func(ba->ba_arg, bp, ba->ba_tx); - if (err == 0 && ba->ba_free) { - ba->ba_phys->bt_bytes -= bp_get_dsize_sync(spa, bp); - ba->ba_phys->bt_comp -= BP_GET_PSIZE(bp); - ba->ba_phys->bt_uncomp -= BP_GET_UCSIZE(bp); - } - return (err); -} - -/* - * If "free" is set: - * - It is assumed that "func" will be freeing the block pointers. - * - If "func" returns nonzero, the bookmark will be remembered and - * iteration will be restarted from this point on next invocation. - * - If an i/o error is encountered (e.g. "func" returns EIO or ECKSUM), - * bptree_iterate will remember the bookmark, continue traversing - * any additional entries, and return 0. - * - * If "free" is not set, traversal will stop and return an error if - * an i/o error is encountered. 
- * - * In either case, if zfs_free_leak_on_eio is set, i/o errors will be - * ignored and traversal will continue (i.e. TRAVERSE_HARD will be passed to - * traverse_dataset_destroyed()). - */ -int -bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func, - void *arg, dmu_tx_t *tx) -{ - boolean_t ioerr = B_FALSE; - int err; - uint64_t i; - dmu_buf_t *db; - struct bptree_args ba; - - ASSERT(!free || dmu_tx_is_syncing(tx)); - - err = dmu_bonus_hold(os, obj, FTAG, &db); - if (err != 0) - return (err); - - if (free) - dmu_buf_will_dirty(db, tx); - - ba.ba_phys = db->db_data; - ba.ba_free = free; - ba.ba_func = func; - ba.ba_arg = arg; - ba.ba_tx = tx; - - err = 0; - for (i = ba.ba_phys->bt_begin; i < ba.ba_phys->bt_end; i++) { - bptree_entry_phys_t bte; - int flags = TRAVERSE_PREFETCH_METADATA | TRAVERSE_POST; - - err = dmu_read(os, obj, i * sizeof (bte), sizeof (bte), - &bte, DMU_READ_NO_PREFETCH); - if (err != 0) - break; - - if (zfs_free_leak_on_eio) - flags |= TRAVERSE_HARD; - zfs_dbgmsg("bptree index %lld: traversing from min_txg=%lld " - "bookmark %lld/%lld/%lld/%lld", - (longlong_t)i, - (longlong_t)bte.be_birth_txg, - (longlong_t)bte.be_zb.zb_objset, - (longlong_t)bte.be_zb.zb_object, - (longlong_t)bte.be_zb.zb_level, - (longlong_t)bte.be_zb.zb_blkid); - err = traverse_dataset_destroyed(os->os_spa, &bte.be_bp, - bte.be_birth_txg, &bte.be_zb, flags, - bptree_visit_cb, &ba); - if (free) { - /* - * The callback has freed the visited block pointers. - * Record our traversal progress on disk, either by - * updating this record's bookmark, or by logically - * removing this record by advancing bt_begin. - */ - if (err != 0) { - /* save bookmark for future resume */ - ASSERT3U(bte.be_zb.zb_objset, ==, - ZB_DESTROYED_OBJSET); - ASSERT0(bte.be_zb.zb_level); - dmu_write(os, obj, i * sizeof (bte), - sizeof (bte), &bte, tx); - if (err == EIO || err == ECKSUM || - err == ENXIO) { - /* - * Skip the rest of this tree and - * continue on to the next entry. - */ - err = 0; - ioerr = B_TRUE; - } else { - break; - } - } else if (ioerr) { - /* - * This entry is finished, but there were - * i/o errors on previous entries, so we - * can't adjust bt_begin. Set this entry's - * be_birth_txg such that it will be - * treated as a no-op in future traversals. - */ - bte.be_birth_txg = UINT64_MAX; - dmu_write(os, obj, i * sizeof (bte), - sizeof (bte), &bte, tx); - } - - if (!ioerr) { - ba.ba_phys->bt_begin++; - (void) dmu_free_range(os, obj, - i * sizeof (bte), sizeof (bte), tx); - } - } else if (err != 0) { - break; - } - } - - ASSERT(!free || err != 0 || ioerr || - ba.ba_phys->bt_begin == ba.ba_phys->bt_end); - - /* if all blocks are free there should be no used space */ - if (ba.ba_phys->bt_begin == ba.ba_phys->bt_end) { - if (zfs_free_leak_on_eio) { - ba.ba_phys->bt_bytes = 0; - ba.ba_phys->bt_comp = 0; - ba.ba_phys->bt_uncomp = 0; - } - - ASSERT0(ba.ba_phys->bt_bytes); - ASSERT0(ba.ba_phys->bt_comp); - ASSERT0(ba.ba_phys->bt_uncomp); - } - - dmu_buf_rele(db, FTAG); - - return (err); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bqueue.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bqueue.c deleted file mode 100644 index 1ddc697b5424..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bqueue.c +++ /dev/null @@ -1,111 +0,0 @@ -/* - * CDDL HEADER START - * - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. 
- * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2014 by Delphix. All rights reserved. - */ - -#include <sys/bqueue.h> -#include <sys/zfs_context.h> - -static inline bqueue_node_t * -obj2node(bqueue_t *q, void *data) -{ - return ((bqueue_node_t *)((char *)data + q->bq_node_offset)); -} - -/* - * Initialize a blocking queue The maximum capacity of the queue is set to - * size. Types that want to be stored in a bqueue must contain a bqueue_node_t, - * and offset should give its offset from the start of the struct. Return 0 on - * success, or -1 on failure. - */ -int -bqueue_init(bqueue_t *q, uint64_t size, size_t node_offset) -{ - list_create(&q->bq_list, node_offset + sizeof (bqueue_node_t), - node_offset + offsetof(bqueue_node_t, bqn_node)); - cv_init(&q->bq_add_cv, NULL, CV_DEFAULT, NULL); - cv_init(&q->bq_pop_cv, NULL, CV_DEFAULT, NULL); - mutex_init(&q->bq_lock, NULL, MUTEX_DEFAULT, NULL); - q->bq_node_offset = node_offset; - q->bq_size = 0; - q->bq_maxsize = size; - return (0); -} - -/* - * Destroy a blocking queue. This function asserts that there are no - * elements in the queue, and no one is blocked on the condition - * variables. - */ -void -bqueue_destroy(bqueue_t *q) -{ - ASSERT0(q->bq_size); - cv_destroy(&q->bq_add_cv); - cv_destroy(&q->bq_pop_cv); - mutex_destroy(&q->bq_lock); - list_destroy(&q->bq_list); -} - -/* - * Add data to q, consuming size units of capacity. If there is insufficient - * capacity to consume size units, block until capacity exists. Asserts size is - * > 0. - */ -void -bqueue_enqueue(bqueue_t *q, void *data, uint64_t item_size) -{ - ASSERT3U(item_size, >, 0); - ASSERT3U(item_size, <, q->bq_maxsize); - mutex_enter(&q->bq_lock); - obj2node(q, data)->bqn_size = item_size; - while (q->bq_size + item_size > q->bq_maxsize) { - cv_wait(&q->bq_add_cv, &q->bq_lock); - } - q->bq_size += item_size; - list_insert_tail(&q->bq_list, data); - cv_signal(&q->bq_pop_cv); - mutex_exit(&q->bq_lock); -} -/* - * Take the first element off of q. If there are no elements on the queue, wait - * until one is put there. Return the removed element. - */ -void * -bqueue_dequeue(bqueue_t *q) -{ - void *ret; - uint64_t item_size; - mutex_enter(&q->bq_lock); - while (q->bq_size == 0) { - cv_wait(&q->bq_pop_cv, &q->bq_lock); - } - ret = list_remove_head(&q->bq_list); - item_size = obj2node(q, ret)->bqn_size; - q->bq_size -= item_size; - mutex_exit(&q->bq_lock); - cv_signal(&q->bq_add_cv); - return (ret); -} - -/* - * Returns true if the space used is 0. - */ -boolean_t -bqueue_empty(bqueue_t *q) -{ - return (q->bq_size == 0); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/cityhash.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/cityhash.c deleted file mode 100644 index 2b62edad0342..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/cityhash.c +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright (c) 2011 Google, Inc. 
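bqueue above is a bounded blocking queue: enqueue sleeps while adding the item would exceed the maximum size, dequeue sleeps while the queue is empty, and each side signals the other's condition variable after making progress. A compact pthread analogue of the same discipline follows, queuing plain integers into a fixed ring instead of charging per-item sizes against a byte budget; a real instance would initialize the mutex and condition variables with pthread_mutex_init()/pthread_cond_init() (or the static initializers) before use.

#include <pthread.h>

#define	QCAP	64

struct bqueue {
	pthread_mutex_t	lock;
	pthread_cond_t	not_full;
	pthread_cond_t	not_empty;
	int		buf[QCAP];
	int		head, tail, count;
};

/* Block while the queue is full, then append and wake one consumer. */
static void
bq_enqueue(struct bqueue *q, int item)
{
	pthread_mutex_lock(&q->lock);
	while (q->count == QCAP)
		pthread_cond_wait(&q->not_full, &q->lock);
	q->buf[q->tail] = item;
	q->tail = (q->tail + 1) % QCAP;
	q->count++;
	pthread_cond_signal(&q->not_empty);
	pthread_mutex_unlock(&q->lock);
}

/* Block while the queue is empty, then remove and wake one producer. */
static int
bq_dequeue(struct bqueue *q)
{
	int item;

	pthread_mutex_lock(&q->lock);
	while (q->count == 0)
		pthread_cond_wait(&q->not_empty, &q->lock);
	item = q->buf[q->head];
	q->head = (q->head + 1) % QCAP;
	q->count--;
	pthread_cond_signal(&q->not_full);
	pthread_mutex_unlock(&q->lock);
	return (item);
}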
-// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE. - -/* - * Copyright (c) 2017 by Delphix. All rights reserved. - */ - -#include <sys/cityhash.h> - -#define HASH_K1 0xb492b66fbe98f273ULL -#define HASH_K2 0x9ae16a3b2f90404fULL - -/* - * Bitwise right rotate. Normally this will compile to a single - * instruction. - */ -static inline uint64_t -rotate(uint64_t val, int shift) -{ - // Avoid shifting by 64: doing so yields an undefined result. - return (shift == 0 ? val : (val >> shift) | (val << (64 - shift))); -} - -static inline uint64_t -cityhash_helper(uint64_t u, uint64_t v, uint64_t mul) -{ - uint64_t a = (u ^ v) * mul; - a ^= (a >> 47); - uint64_t b = (v ^ a) * mul; - b ^= (b >> 47); - b *= mul; - return (b); -} - -uint64_t -cityhash4(uint64_t w1, uint64_t w2, uint64_t w3, uint64_t w4) -{ - uint64_t mul = HASH_K2 + 64; - uint64_t a = w1 * HASH_K1; - uint64_t b = w2; - uint64_t c = w4 * mul; - uint64_t d = w3 * HASH_K2; - return (cityhash_helper(rotate(a + b, 43) + rotate(c, 30) + d, - a + rotate(b + HASH_K2, 18) + c, mul)); - -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c deleted file mode 100644 index 1974ff2197c2..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c +++ /dev/null @@ -1,4248 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. - * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. - * Copyright (c) 2013, Joyent, Inc. All rights reserved. 
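The rotate() helper above branches on shift == 0 because shifting a 64-bit value by 64 is undefined behaviour in C. An equally common branchless formulation masks both shift amounts instead; this is an alternative shown for comparison, not what the removed file does:

#include <stdint.h>

/*
 * Branchless right rotate: both shift amounts are masked into 0..63, so
 * the undefined shift-by-64 case never arises, and shift == 0 naturally
 * yields the input value unchanged.
 */
static inline uint64_t
rotr64(uint64_t val, unsigned shift)
{
	return ((val >> (shift & 63)) | (val << ((64 - shift) & 63)));
}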
- * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - */ - -#include <sys/zfs_context.h> -#include <sys/dmu.h> -#include <sys/dmu_send.h> -#include <sys/dmu_impl.h> -#include <sys/dbuf.h> -#include <sys/dmu_objset.h> -#include <sys/dsl_dataset.h> -#include <sys/dsl_dir.h> -#include <sys/dmu_tx.h> -#include <sys/spa.h> -#include <sys/zio.h> -#include <sys/dmu_zfetch.h> -#include <sys/sa.h> -#include <sys/sa_impl.h> -#include <sys/zfeature.h> -#include <sys/blkptr.h> -#include <sys/range_tree.h> -#include <sys/callb.h> -#include <sys/abd.h> -#include <sys/vdev.h> -#include <sys/cityhash.h> -#include <sys/spa_impl.h> - -kstat_t *dbuf_ksp; - -typedef struct dbuf_stats { - /* - * Various statistics about the size of the dbuf cache. - */ - kstat_named_t cache_count; - kstat_named_t cache_size_bytes; - kstat_named_t cache_size_bytes_max; - /* - * Statistics regarding the bounds on the dbuf cache size. - */ - kstat_named_t cache_target_bytes; - kstat_named_t cache_lowater_bytes; - kstat_named_t cache_hiwater_bytes; - /* - * Total number of dbuf cache evictions that have occurred. - */ - kstat_named_t cache_total_evicts; - /* - * The distribution of dbuf levels in the dbuf cache and - * the total size of all dbufs at each level. - */ - kstat_named_t cache_levels[DN_MAX_LEVELS]; - kstat_named_t cache_levels_bytes[DN_MAX_LEVELS]; - /* - * Statistics about the dbuf hash table. - */ - kstat_named_t hash_hits; - kstat_named_t hash_misses; - kstat_named_t hash_collisions; - kstat_named_t hash_elements; - kstat_named_t hash_elements_max; - /* - * Number of sublists containing more than one dbuf in the dbuf - * hash table. Keep track of the longest hash chain. - */ - kstat_named_t hash_chains; - kstat_named_t hash_chain_max; - /* - * Number of times a dbuf_create() discovers that a dbuf was - * already created and in the dbuf hash table. - */ - kstat_named_t hash_insert_race; - /* - * Statistics about the size of the metadata dbuf cache. - */ - kstat_named_t metadata_cache_count; - kstat_named_t metadata_cache_size_bytes; - kstat_named_t metadata_cache_size_bytes_max; - /* - * For diagnostic purposes, this is incremented whenever we can't add - * something to the metadata cache because it's full, and instead put - * the data in the regular dbuf cache. 
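The DBUF_STAT_MAX macro above maintains a running maximum without a lock: it rereads the current value and retries a compare-and-swap until either the stored maximum already covers the candidate or the swap lands. The same loop in portable C11 atomics (the counter name is illustrative):

#include <stdatomic.h>
#include <stdint.h>

static _Atomic uint64_t cache_size_bytes_max;

/*
 * Lock-free "store v if it is larger than the current maximum".
 * compare_exchange_weak reloads the observed value into "cur" on failure,
 * so each retry re-tests against what another thread just published; the
 * loop ends once the stored maximum is at least v or the swap succeeds.
 */
static void
stat_max(_Atomic uint64_t *maxp, uint64_t v)
{
	uint64_t cur = atomic_load(maxp);

	while (v > cur && !atomic_compare_exchange_weak(maxp, &cur, v))
		;
}

/* e.g. after the cache grows: stat_max(&cache_size_bytes_max, new_size); */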
- */ - kstat_named_t metadata_cache_overflow; -} dbuf_stats_t; - -dbuf_stats_t dbuf_stats = { - { "cache_count", KSTAT_DATA_UINT64 }, - { "cache_size_bytes", KSTAT_DATA_UINT64 }, - { "cache_size_bytes_max", KSTAT_DATA_UINT64 }, - { "cache_target_bytes", KSTAT_DATA_UINT64 }, - { "cache_lowater_bytes", KSTAT_DATA_UINT64 }, - { "cache_hiwater_bytes", KSTAT_DATA_UINT64 }, - { "cache_total_evicts", KSTAT_DATA_UINT64 }, - { { "cache_levels_N", KSTAT_DATA_UINT64 } }, - { { "cache_levels_bytes_N", KSTAT_DATA_UINT64 } }, - { "hash_hits", KSTAT_DATA_UINT64 }, - { "hash_misses", KSTAT_DATA_UINT64 }, - { "hash_collisions", KSTAT_DATA_UINT64 }, - { "hash_elements", KSTAT_DATA_UINT64 }, - { "hash_elements_max", KSTAT_DATA_UINT64 }, - { "hash_chains", KSTAT_DATA_UINT64 }, - { "hash_chain_max", KSTAT_DATA_UINT64 }, - { "hash_insert_race", KSTAT_DATA_UINT64 }, - { "metadata_cache_count", KSTAT_DATA_UINT64 }, - { "metadata_cache_size_bytes", KSTAT_DATA_UINT64 }, - { "metadata_cache_size_bytes_max", KSTAT_DATA_UINT64 }, - { "metadata_cache_overflow", KSTAT_DATA_UINT64 } -}; - -#define DBUF_STAT_INCR(stat, val) \ - atomic_add_64(&dbuf_stats.stat.value.ui64, (val)); -#define DBUF_STAT_DECR(stat, val) \ - DBUF_STAT_INCR(stat, -(val)); -#define DBUF_STAT_BUMP(stat) \ - DBUF_STAT_INCR(stat, 1); -#define DBUF_STAT_BUMPDOWN(stat) \ - DBUF_STAT_INCR(stat, -1); -#define DBUF_STAT_MAX(stat, v) { \ - uint64_t _m; \ - while ((v) > (_m = dbuf_stats.stat.value.ui64) && \ - (_m != atomic_cas_64(&dbuf_stats.stat.value.ui64, _m, (v))))\ - continue; \ -} - -struct dbuf_hold_impl_data { - /* Function arguments */ - dnode_t *dh_dn; - uint8_t dh_level; - uint64_t dh_blkid; - boolean_t dh_fail_sparse; - boolean_t dh_fail_uncached; - void *dh_tag; - dmu_buf_impl_t **dh_dbp; - /* Local variables */ - dmu_buf_impl_t *dh_db; - dmu_buf_impl_t *dh_parent; - blkptr_t *dh_bp; - int dh_err; - dbuf_dirty_record_t *dh_dr; - int dh_depth; -}; - -static void __dbuf_hold_impl_init(struct dbuf_hold_impl_data *dh, - dnode_t *dn, uint8_t level, uint64_t blkid, boolean_t fail_sparse, - boolean_t fail_uncached, - void *tag, dmu_buf_impl_t **dbp, int depth); -static int __dbuf_hold_impl(struct dbuf_hold_impl_data *dh); - -static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); -static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); - -/* - * Global data structures and functions for the dbuf cache. - */ -static kmem_cache_t *dbuf_kmem_cache; -static taskq_t *dbu_evict_taskq; - -static kthread_t *dbuf_cache_evict_thread; -static kmutex_t dbuf_evict_lock; -static kcondvar_t dbuf_evict_cv; -static boolean_t dbuf_evict_thread_exit; - -/* - * There are two dbuf caches; each dbuf can only be in one of them at a time. - * - * 1. Cache of metadata dbufs, to help make read-heavy administrative commands - * from /sbin/zfs run faster. The "metadata cache" specifically stores dbufs - * that represent the metadata that describes filesystems/snapshots/ - * bookmarks/properties/etc. We only evict from this cache when we export a - * pool, to short-circuit as much I/O as possible for all administrative - * commands that need the metadata. There is no eviction policy for this - * cache, because we try to only include types in it which would occupy a - * very small amount of space per object but create a large impact on the - * performance of these commands. 
Instead, after it reaches a maximum size - * (which should only happen on very small memory systems with a very large - * number of filesystem objects), we stop taking new dbufs into the - * metadata cache, instead putting them in the normal dbuf cache. - * - * 2. LRU cache of dbufs. The dbuf cache maintains a list of dbufs that - * are not currently held but have been recently released. These dbufs - * are not eligible for arc eviction until they are aged out of the cache. - * Dbufs that are aged out of the cache will be immediately destroyed and - * become eligible for arc eviction. - * - * Dbufs are added to these caches once the last hold is released. If a dbuf is - * later accessed and still exists in the dbuf cache, then it will be removed - * from the cache and later re-added to the head of the cache. - * - * If a given dbuf meets the requirements for the metadata cache, it will go - * there, otherwise it will be considered for the generic LRU dbuf cache. The - * caches and the refcounts tracking their sizes are stored in an array indexed - * by those caches' matching enum values (from dbuf_cached_state_t). - */ -typedef struct dbuf_cache { - multilist_t *cache; - zfs_refcount_t size; -} dbuf_cache_t; -dbuf_cache_t dbuf_caches[DB_CACHE_MAX]; - -/* Size limits for the caches */ -uint64_t dbuf_cache_max_bytes = 0; -uint64_t dbuf_metadata_cache_max_bytes = 0; -/* Set the default sizes of the caches to log2 fraction of arc size */ -int dbuf_cache_shift = 5; -int dbuf_metadata_cache_shift = 6; - -/* - * For diagnostic purposes, this is incremented whenever we can't add - * something to the metadata cache because it's full, and instead put - * the data in the regular dbuf cache. - */ -uint64_t dbuf_metadata_cache_overflow; - -/* - * The LRU dbuf cache uses a three-stage eviction policy: - * - A low water marker designates when the dbuf eviction thread - * should stop evicting from the dbuf cache. - * - When we reach the maximum size (aka mid water mark), we - * signal the eviction thread to run. - * - The high water mark indicates when the eviction thread - * is unable to keep up with the incoming load and eviction must - * happen in the context of the calling thread. - * - * The dbuf cache: - * (max size) - * low water mid water hi water - * +----------------------------------------+----------+----------+ - * | | | | - * | | | | - * | | | | - * | | | | - * +----------------------------------------+----------+----------+ - * stop signal evict - * evicting eviction directly - * thread - * - * The high and low water marks indicate the operating range for the eviction - * thread. The low water mark is, by default, 90% of the total size of the - * cache and the high water mark is at 110% (both of these percentages can be - * changed by setting dbuf_cache_lowater_pct and dbuf_cache_hiwater_pct, - * respectively). The eviction thread will try to ensure that the cache remains - * within this range by waking up every second and checking if the cache is - * above the low water mark. The thread can also be woken up by callers adding - * elements into the cache if the cache is larger than the mid water (i.e max - * cache size). Once the eviction thread is woken up and eviction is required, - * it will continue evicting buffers until it's able to reduce the cache size - * to the low water mark. 
If the cache size continues to grow and hits the high - * water mark, then callers adding elments to the cache will begin to evict - * directly from the cache until the cache is no longer above the high water - * mark. - */ - -/* - * The percentage above and below the maximum cache size. - */ -uint_t dbuf_cache_hiwater_pct = 10; -uint_t dbuf_cache_lowater_pct = 10; - -SYSCTL_DECL(_vfs_zfs); -SYSCTL_QUAD(_vfs_zfs, OID_AUTO, dbuf_cache_max_bytes, CTLFLAG_RWTUN, - &dbuf_cache_max_bytes, 0, "dbuf cache size in bytes"); -SYSCTL_QUAD(_vfs_zfs, OID_AUTO, dbuf_metadata_cache_max_bytes, CTLFLAG_RWTUN, - &dbuf_metadata_cache_max_bytes, 0, "dbuf metadata cache size in bytes"); -SYSCTL_INT(_vfs_zfs, OID_AUTO, dbuf_cache_shift, CTLFLAG_RDTUN, - &dbuf_cache_shift, 0, "dbuf cache size as log2 fraction of ARC"); -SYSCTL_INT(_vfs_zfs, OID_AUTO, dbuf_metadata_cache_shift, CTLFLAG_RDTUN, - &dbuf_metadata_cache_shift, 0, - "dbuf metadata cache size as log2 fraction of ARC"); -SYSCTL_QUAD(_vfs_zfs, OID_AUTO, dbuf_metadata_cache_overflow, CTLFLAG_RD, - &dbuf_metadata_cache_overflow, 0, "dbuf metadata cache overflow"); -SYSCTL_UINT(_vfs_zfs, OID_AUTO, dbuf_cache_hiwater_pct, CTLFLAG_RWTUN, - &dbuf_cache_hiwater_pct, 0, "max percents above the dbuf cache size"); -SYSCTL_UINT(_vfs_zfs, OID_AUTO, dbuf_cache_lowater_pct, CTLFLAG_RWTUN, - &dbuf_cache_lowater_pct, 0, "max percents below the dbuf cache size"); - -/* ARGSUSED */ -static int -dbuf_cons(void *vdb, void *unused, int kmflag) -{ - dmu_buf_impl_t *db = vdb; - bzero(db, sizeof (dmu_buf_impl_t)); - - mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL); - cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL); - multilist_link_init(&db->db_cache_link); - zfs_refcount_create(&db->db_holds); - - return (0); -} - -/* ARGSUSED */ -static void -dbuf_dest(void *vdb, void *unused) -{ - dmu_buf_impl_t *db = vdb; - mutex_destroy(&db->db_mtx); - cv_destroy(&db->db_changed); - ASSERT(!multilist_link_active(&db->db_cache_link)); - zfs_refcount_destroy(&db->db_holds); -} - -/* - * dbuf hash table routines - */ -static dbuf_hash_table_t dbuf_hash_table; - -static uint64_t dbuf_hash_count; - -/* - * We use Cityhash for this. It's fast, and has good hash properties without - * requiring any large static buffers. 
- */ -static uint64_t -dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid) -{ - return (cityhash4((uintptr_t)os, obj, (uint64_t)lvl, blkid)); -} - -#define DBUF_EQUAL(dbuf, os, obj, level, blkid) \ - ((dbuf)->db.db_object == (obj) && \ - (dbuf)->db_objset == (os) && \ - (dbuf)->db_level == (level) && \ - (dbuf)->db_blkid == (blkid)) - -dmu_buf_impl_t * -dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid) -{ - dbuf_hash_table_t *h = &dbuf_hash_table; - uint64_t hv = dbuf_hash(os, obj, level, blkid); - uint64_t idx = hv & h->hash_table_mask; - dmu_buf_impl_t *db; - - mutex_enter(DBUF_HASH_MUTEX(h, idx)); - for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) { - if (DBUF_EQUAL(db, os, obj, level, blkid)) { - mutex_enter(&db->db_mtx); - if (db->db_state != DB_EVICTING) { - mutex_exit(DBUF_HASH_MUTEX(h, idx)); - return (db); - } - mutex_exit(&db->db_mtx); - } - } - mutex_exit(DBUF_HASH_MUTEX(h, idx)); - return (NULL); -} - -static dmu_buf_impl_t * -dbuf_find_bonus(objset_t *os, uint64_t object) -{ - dnode_t *dn; - dmu_buf_impl_t *db = NULL; - - if (dnode_hold(os, object, FTAG, &dn) == 0) { - rw_enter(&dn->dn_struct_rwlock, RW_READER); - if (dn->dn_bonus != NULL) { - db = dn->dn_bonus; - mutex_enter(&db->db_mtx); - } - rw_exit(&dn->dn_struct_rwlock); - dnode_rele(dn, FTAG); - } - return (db); -} - -/* - * Insert an entry into the hash table. If there is already an element - * equal to elem in the hash table, then the already existing element - * will be returned and the new element will not be inserted. - * Otherwise returns NULL. - */ -static dmu_buf_impl_t * -dbuf_hash_insert(dmu_buf_impl_t *db) -{ - dbuf_hash_table_t *h = &dbuf_hash_table; - objset_t *os = db->db_objset; - uint64_t obj = db->db.db_object; - int level = db->db_level; - uint64_t blkid, hv, idx; - dmu_buf_impl_t *dbf; - uint32_t i; - - blkid = db->db_blkid; - hv = dbuf_hash(os, obj, level, blkid); - idx = hv & h->hash_table_mask; - - mutex_enter(DBUF_HASH_MUTEX(h, idx)); - for (dbf = h->hash_table[idx], i = 0; dbf != NULL; - dbf = dbf->db_hash_next, i++) { - if (DBUF_EQUAL(dbf, os, obj, level, blkid)) { - mutex_enter(&dbf->db_mtx); - if (dbf->db_state != DB_EVICTING) { - mutex_exit(DBUF_HASH_MUTEX(h, idx)); - return (dbf); - } - mutex_exit(&dbf->db_mtx); - } - } - - if (i > 0) { - DBUF_STAT_BUMP(hash_collisions); - if (i == 1) - DBUF_STAT_BUMP(hash_chains); - - DBUF_STAT_MAX(hash_chain_max, i); - } - - mutex_enter(&db->db_mtx); - db->db_hash_next = h->hash_table[idx]; - h->hash_table[idx] = db; - mutex_exit(DBUF_HASH_MUTEX(h, idx)); - atomic_inc_64(&dbuf_hash_count); - DBUF_STAT_MAX(hash_elements_max, dbuf_hash_count); - - return (NULL); -} - -/* - * Remove an entry from the hash table. It must be in the EVICTING state. - */ -static void -dbuf_hash_remove(dmu_buf_impl_t *db) -{ - dbuf_hash_table_t *h = &dbuf_hash_table; - uint64_t hv, idx; - dmu_buf_impl_t *dbf, **dbp; - - hv = dbuf_hash(db->db_objset, db->db.db_object, - db->db_level, db->db_blkid); - idx = hv & h->hash_table_mask; - - /* - * We mustn't hold db_mtx to maintain lock ordering: - * DBUF_HASH_MUTEX > db_mtx. 
- */ - ASSERT(zfs_refcount_is_zero(&db->db_holds)); - ASSERT(db->db_state == DB_EVICTING); - ASSERT(!MUTEX_HELD(&db->db_mtx)); - - mutex_enter(DBUF_HASH_MUTEX(h, idx)); - dbp = &h->hash_table[idx]; - while ((dbf = *dbp) != db) { - dbp = &dbf->db_hash_next; - ASSERT(dbf != NULL); - } - *dbp = db->db_hash_next; - db->db_hash_next = NULL; - if (h->hash_table[idx] && - h->hash_table[idx]->db_hash_next == NULL) - DBUF_STAT_BUMPDOWN(hash_chains); - mutex_exit(DBUF_HASH_MUTEX(h, idx)); - atomic_dec_64(&dbuf_hash_count); -} - -typedef enum { - DBVU_EVICTING, - DBVU_NOT_EVICTING -} dbvu_verify_type_t; - -static void -dbuf_verify_user(dmu_buf_impl_t *db, dbvu_verify_type_t verify_type) -{ -#ifdef ZFS_DEBUG - int64_t holds; - - if (db->db_user == NULL) - return; - - /* Only data blocks support the attachment of user data. */ - ASSERT(db->db_level == 0); - - /* Clients must resolve a dbuf before attaching user data. */ - ASSERT(db->db.db_data != NULL); - ASSERT3U(db->db_state, ==, DB_CACHED); - - holds = zfs_refcount_count(&db->db_holds); - if (verify_type == DBVU_EVICTING) { - /* - * Immediate eviction occurs when holds == dirtycnt. - * For normal eviction buffers, holds is zero on - * eviction, except when dbuf_fix_old_data() calls - * dbuf_clear_data(). However, the hold count can grow - * during eviction even though db_mtx is held (see - * dmu_bonus_hold() for an example), so we can only - * test the generic invariant that holds >= dirtycnt. - */ - ASSERT3U(holds, >=, db->db_dirtycnt); - } else { - if (db->db_user_immediate_evict == TRUE) - ASSERT3U(holds, >=, db->db_dirtycnt); - else - ASSERT3U(holds, >, 0); - } -#endif -} - -static void -dbuf_evict_user(dmu_buf_impl_t *db) -{ - dmu_buf_user_t *dbu = db->db_user; - - ASSERT(MUTEX_HELD(&db->db_mtx)); - - if (dbu == NULL) - return; - - dbuf_verify_user(db, DBVU_EVICTING); - db->db_user = NULL; - -#ifdef ZFS_DEBUG - if (dbu->dbu_clear_on_evict_dbufp != NULL) - *dbu->dbu_clear_on_evict_dbufp = NULL; -#endif - - /* - * There are two eviction callbacks - one that we call synchronously - * and one that we invoke via a taskq. The async one is useful for - * avoiding lock order reversals and limiting stack depth. - * - * Note that if we have a sync callback but no async callback, - * it's likely that the sync callback will free the structure - * containing the dbu. In that case we need to take care to not - * dereference dbu after calling the sync evict func. - */ - boolean_t has_async = (dbu->dbu_evict_func_async != NULL); - - if (dbu->dbu_evict_func_sync != NULL) - dbu->dbu_evict_func_sync(dbu); - - if (has_async) { - taskq_dispatch_ent(dbu_evict_taskq, dbu->dbu_evict_func_async, - dbu, 0, &dbu->dbu_tqent); - } -} - -boolean_t -dbuf_is_metadata(dmu_buf_impl_t *db) -{ - if (db->db_level > 0) { - return (B_TRUE); - } else { - boolean_t is_metadata; - - DB_DNODE_ENTER(db); - is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type); - DB_DNODE_EXIT(db); - - return (is_metadata); - } -} - -/* - * This returns whether this dbuf should be stored in the metadata cache, which - * is based on whether it's from one of the dnode types that store data related - * to traversing dataset hierarchies. 
- */ -static boolean_t -dbuf_include_in_metadata_cache(dmu_buf_impl_t *db) -{ - DB_DNODE_ENTER(db); - dmu_object_type_t type = DB_DNODE(db)->dn_type; - DB_DNODE_EXIT(db); - - /* Check if this dbuf is one of the types we care about */ - if (DMU_OT_IS_METADATA_CACHED(type)) { - /* If we hit this, then we set something up wrong in dmu_ot */ - ASSERT(DMU_OT_IS_METADATA(type)); - - /* - * Sanity check for small-memory systems: don't allocate too - * much memory for this purpose. - */ - if (zfs_refcount_count( - &dbuf_caches[DB_DBUF_METADATA_CACHE].size) > - dbuf_metadata_cache_max_bytes) { - dbuf_metadata_cache_overflow++; - DTRACE_PROBE1(dbuf__metadata__cache__overflow, - dmu_buf_impl_t *, db); - return (B_FALSE); - } - - return (B_TRUE); - } - - return (B_FALSE); -} - -/* - * This function *must* return indices evenly distributed between all - * sublists of the multilist. This is needed due to how the dbuf eviction - * code is laid out; dbuf_evict_thread() assumes dbufs are evenly - * distributed between all sublists and uses this assumption when - * deciding which sublist to evict from and how much to evict from it. - */ -unsigned int -dbuf_cache_multilist_index_func(multilist_t *ml, void *obj) -{ - dmu_buf_impl_t *db = obj; - - /* - * The assumption here, is the hash value for a given - * dmu_buf_impl_t will remain constant throughout it's lifetime - * (i.e. it's objset, object, level and blkid fields don't change). - * Thus, we don't need to store the dbuf's sublist index - * on insertion, as this index can be recalculated on removal. - * - * Also, the low order bits of the hash value are thought to be - * distributed evenly. Otherwise, in the case that the multilist - * has a power of two number of sublists, each sublists' usage - * would not be evenly distributed. - */ - return (dbuf_hash(db->db_objset, db->db.db_object, - db->db_level, db->db_blkid) % - multilist_get_num_sublists(ml)); -} - -static inline unsigned long -dbuf_cache_target_bytes(void) -{ - return MIN(dbuf_cache_max_bytes, - arc_max_bytes() >> dbuf_cache_shift); -} - -static inline uint64_t -dbuf_cache_hiwater_bytes(void) -{ - uint64_t dbuf_cache_target = dbuf_cache_target_bytes(); - return (dbuf_cache_target + - (dbuf_cache_target * dbuf_cache_hiwater_pct) / 100); -} - -static inline uint64_t -dbuf_cache_lowater_bytes(void) -{ - uint64_t dbuf_cache_target = dbuf_cache_target_bytes(); - return (dbuf_cache_target - - (dbuf_cache_target * dbuf_cache_lowater_pct) / 100); -} - -static inline boolean_t -dbuf_cache_above_lowater(void) -{ - return (zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) > - dbuf_cache_lowater_bytes()); -} - -/* - * Evict the oldest eligible dbuf from the dbuf cache. 
- */ -static void -dbuf_evict_one(void) -{ - int idx = multilist_get_random_index(dbuf_caches[DB_DBUF_CACHE].cache); - multilist_sublist_t *mls = multilist_sublist_lock( - dbuf_caches[DB_DBUF_CACHE].cache, idx); - - ASSERT(!MUTEX_HELD(&dbuf_evict_lock)); - - dmu_buf_impl_t *db = multilist_sublist_tail(mls); - while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) { - db = multilist_sublist_prev(mls, db); - } - - DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t *, db, - multilist_sublist_t *, mls); - - if (db != NULL) { - multilist_sublist_remove(mls, db); - multilist_sublist_unlock(mls); - (void) zfs_refcount_remove_many( - &dbuf_caches[DB_DBUF_CACHE].size, - db->db.db_size, db); - DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]); - DBUF_STAT_BUMPDOWN(cache_count); - DBUF_STAT_DECR(cache_levels_bytes[db->db_level], - db->db.db_size); - ASSERT3U(db->db_caching_status, ==, DB_DBUF_CACHE); - db->db_caching_status = DB_NO_CACHE; - dbuf_destroy(db); - DBUF_STAT_BUMP(cache_total_evicts); - } else { - multilist_sublist_unlock(mls); - } -} - -/* - * The dbuf evict thread is responsible for aging out dbufs from the - * cache. Once the cache has reached it's maximum size, dbufs are removed - * and destroyed. The eviction thread will continue running until the size - * of the dbuf cache is at or below the maximum size. Once the dbuf is aged - * out of the cache it is destroyed and becomes eligible for arc eviction. - */ -/* ARGSUSED */ -static void -dbuf_evict_thread(void *unused __unused) -{ - callb_cpr_t cpr; - - CALLB_CPR_INIT(&cpr, &dbuf_evict_lock, callb_generic_cpr, FTAG); - - mutex_enter(&dbuf_evict_lock); - while (!dbuf_evict_thread_exit) { - while (!dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) { - CALLB_CPR_SAFE_BEGIN(&cpr); - (void) cv_timedwait_hires(&dbuf_evict_cv, - &dbuf_evict_lock, SEC2NSEC(1), MSEC2NSEC(1), 0); - CALLB_CPR_SAFE_END(&cpr, &dbuf_evict_lock); -#ifdef __FreeBSD__ - if (dbuf_ksp != NULL) - dbuf_ksp->ks_update(dbuf_ksp, KSTAT_READ); -#endif - } - mutex_exit(&dbuf_evict_lock); - - /* - * Keep evicting as long as we're above the low water mark - * for the cache. We do this without holding the locks to - * minimize lock contention. - */ - while (dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) { - dbuf_evict_one(); - } - - mutex_enter(&dbuf_evict_lock); - } - - dbuf_evict_thread_exit = B_FALSE; - cv_broadcast(&dbuf_evict_cv); - CALLB_CPR_EXIT(&cpr); /* drops dbuf_evict_lock */ - thread_exit(); -} - -/* - * Wake up the dbuf eviction thread if the dbuf cache is at its max size. - * If the dbuf cache is at its high water mark, then evict a dbuf from the - * dbuf cache using the callers context. - */ -static void -dbuf_evict_notify(uint64_t size) -{ - /* - * We check if we should evict without holding the dbuf_evict_lock, - * because it's OK to occasionally make the wrong decision here, - * and grabbing the lock results in massive lock contention. 
- */ - if (size > dbuf_cache_max_bytes) { - if (size > dbuf_cache_hiwater_bytes()) - dbuf_evict_one(); - cv_signal(&dbuf_evict_cv); - } -} - -static int -dbuf_kstat_update(kstat_t *ksp, int rw) -{ - dbuf_stats_t *ds = ksp->ks_data; - - if (rw == KSTAT_WRITE) { - return (SET_ERROR(EACCES)); - } else { - ds->metadata_cache_size_bytes.value.ui64 = - zfs_refcount_count(&dbuf_caches[DB_DBUF_METADATA_CACHE].size); - ds->cache_size_bytes.value.ui64 = - zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size); - ds->cache_target_bytes.value.ui64 = dbuf_cache_target_bytes(); - ds->cache_hiwater_bytes.value.ui64 = dbuf_cache_hiwater_bytes(); - ds->cache_lowater_bytes.value.ui64 = dbuf_cache_lowater_bytes(); - ds->hash_elements.value.ui64 = dbuf_hash_count; - } - - return (0); -} - -void -dbuf_init(void) -{ - uint64_t hsize = 1ULL << 16; - dbuf_hash_table_t *h = &dbuf_hash_table; - int i; - - /* - * The hash table is big enough to fill all of physical memory - * with an average 4K block size. The table will take up - * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers). - */ - while (hsize * 4096 < (uint64_t)physmem * PAGESIZE) - hsize <<= 1; - -retry: - h->hash_table_mask = hsize - 1; - h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP); - if (h->hash_table == NULL) { - /* XXX - we should really return an error instead of assert */ - ASSERT(hsize > (1ULL << 10)); - hsize >>= 1; - goto retry; - } - - dbuf_kmem_cache = kmem_cache_create("dmu_buf_impl_t", - sizeof (dmu_buf_impl_t), - 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0); - - for (i = 0; i < DBUF_MUTEXES; i++) - mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); - - dbuf_stats_init(h); - /* - * Setup the parameters for the dbuf caches. We set the sizes of the - * dbuf cache and the metadata cache to 1/32nd and 1/16th (default) - * of the size of the ARC, respectively. If the values are set in - * /etc/system and they're not greater than the size of the ARC, then - * we honor that value. - */ - if (dbuf_cache_max_bytes == 0 || - dbuf_cache_max_bytes >= arc_max_bytes()) { - dbuf_cache_max_bytes = arc_max_bytes() >> dbuf_cache_shift; - } - if (dbuf_metadata_cache_max_bytes == 0 || - dbuf_metadata_cache_max_bytes >= arc_max_bytes()) { - dbuf_metadata_cache_max_bytes = - arc_max_bytes() >> dbuf_metadata_cache_shift; - } - - /* - * All entries are queued via taskq_dispatch_ent(), so min/maxalloc - * configuration is not required. 
- */ - dbu_evict_taskq = taskq_create("dbu_evict", 1, minclsyspri, 0, 0, 0); - - for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) { - dbuf_caches[dcs].cache = - multilist_create(sizeof (dmu_buf_impl_t), - offsetof(dmu_buf_impl_t, db_cache_link), - dbuf_cache_multilist_index_func); - zfs_refcount_create(&dbuf_caches[dcs].size); - } - - dbuf_evict_thread_exit = B_FALSE; - mutex_init(&dbuf_evict_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&dbuf_evict_cv, NULL, CV_DEFAULT, NULL); - dbuf_cache_evict_thread = thread_create(NULL, 0, dbuf_evict_thread, - NULL, 0, &p0, TS_RUN, minclsyspri); - - dbuf_ksp = kstat_create("zfs", 0, "dbufstats", "misc", - KSTAT_TYPE_NAMED, sizeof (dbuf_stats) / sizeof (kstat_named_t), - KSTAT_FLAG_VIRTUAL); - if (dbuf_ksp != NULL) { - for (i = 0; i < DN_MAX_LEVELS; i++) { - snprintf(dbuf_stats.cache_levels[i].name, - KSTAT_STRLEN, "cache_level_%d", i); - dbuf_stats.cache_levels[i].data_type = - KSTAT_DATA_UINT64; - snprintf(dbuf_stats.cache_levels_bytes[i].name, - KSTAT_STRLEN, "cache_level_%d_bytes", i); - dbuf_stats.cache_levels_bytes[i].data_type = - KSTAT_DATA_UINT64; - } - dbuf_ksp->ks_data = &dbuf_stats; - dbuf_ksp->ks_update = dbuf_kstat_update; - kstat_install(dbuf_ksp); - } -} - -void -dbuf_fini(void) -{ - dbuf_hash_table_t *h = &dbuf_hash_table; - int i; - - dbuf_stats_destroy(); - - for (i = 0; i < DBUF_MUTEXES; i++) - mutex_destroy(&h->hash_mutexes[i]); - kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); - kmem_cache_destroy(dbuf_kmem_cache); - taskq_destroy(dbu_evict_taskq); - - mutex_enter(&dbuf_evict_lock); - dbuf_evict_thread_exit = B_TRUE; - while (dbuf_evict_thread_exit) { - cv_signal(&dbuf_evict_cv); - cv_wait(&dbuf_evict_cv, &dbuf_evict_lock); - } - mutex_exit(&dbuf_evict_lock); - - mutex_destroy(&dbuf_evict_lock); - cv_destroy(&dbuf_evict_cv); - - for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) { - zfs_refcount_destroy(&dbuf_caches[dcs].size); - multilist_destroy(dbuf_caches[dcs].cache); - } - - if (dbuf_ksp != NULL) { - kstat_delete(dbuf_ksp); - dbuf_ksp = NULL; - } -} - -/* - * Other stuff. - */ - -#ifdef ZFS_DEBUG -static void -dbuf_verify(dmu_buf_impl_t *db) -{ - dnode_t *dn; - dbuf_dirty_record_t *dr; - - ASSERT(MUTEX_HELD(&db->db_mtx)); - - if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY)) - return; - - ASSERT(db->db_objset != NULL); - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - if (dn == NULL) { - ASSERT(db->db_parent == NULL); - ASSERT(db->db_blkptr == NULL); - } else { - ASSERT3U(db->db.db_object, ==, dn->dn_object); - ASSERT3P(db->db_objset, ==, dn->dn_objset); - ASSERT3U(db->db_level, <, dn->dn_nlevels); - ASSERT(db->db_blkid == DMU_BONUS_BLKID || - db->db_blkid == DMU_SPILL_BLKID || - !avl_is_empty(&dn->dn_dbufs)); - } - if (db->db_blkid == DMU_BONUS_BLKID) { - ASSERT(dn != NULL); - ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); - ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID); - } else if (db->db_blkid == DMU_SPILL_BLKID) { - ASSERT(dn != NULL); - ASSERT0(db->db.db_offset); - } else { - ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size); - } - - for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next) - ASSERT(dr->dr_dbuf == db); - - for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next) - ASSERT(dr->dr_dbuf == db); - - /* - * We can't assert that db_size matches dn_datablksz because it - * can be momentarily different when another thread is doing - * dnode_set_blksz(). 
- */ - if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) { - dr = db->db_data_pending; - /* - * It should only be modified in syncing context, so - * make sure we only have one copy of the data. - */ - ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf); - } - - /* verify db->db_blkptr */ - if (db->db_blkptr) { - if (db->db_parent == dn->dn_dbuf) { - /* db is pointed to by the dnode */ - /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */ - if (DMU_OBJECT_IS_SPECIAL(db->db.db_object)) - ASSERT(db->db_parent == NULL); - else - ASSERT(db->db_parent != NULL); - if (db->db_blkid != DMU_SPILL_BLKID) - ASSERT3P(db->db_blkptr, ==, - &dn->dn_phys->dn_blkptr[db->db_blkid]); - } else { - /* db is pointed to by an indirect block */ - int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT; - ASSERT3U(db->db_parent->db_level, ==, db->db_level+1); - ASSERT3U(db->db_parent->db.db_object, ==, - db->db.db_object); - /* - * dnode_grow_indblksz() can make this fail if we don't - * have the struct_rwlock. XXX indblksz no longer - * grows. safe to do this now? - */ - if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) { - ASSERT3P(db->db_blkptr, ==, - ((blkptr_t *)db->db_parent->db.db_data + - db->db_blkid % epb)); - } - } - } - if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) && - (db->db_buf == NULL || db->db_buf->b_data) && - db->db.db_data && db->db_blkid != DMU_BONUS_BLKID && - db->db_state != DB_FILL && !dn->dn_free_txg) { - /* - * If the blkptr isn't set but they have nonzero data, - * it had better be dirty, otherwise we'll lose that - * data when we evict this buffer. - * - * There is an exception to this rule for indirect blocks; in - * this case, if the indirect block is a hole, we fill in a few - * fields on each of the child blocks (importantly, birth time) - * to prevent hole birth times from being lost when you - * partially fill in a hole. - */ - if (db->db_dirtycnt == 0) { - if (db->db_level == 0) { - uint64_t *buf = db->db.db_data; - int i; - - for (i = 0; i < db->db.db_size >> 3; i++) { - ASSERT(buf[i] == 0); - } - } else { - blkptr_t *bps = db->db.db_data; - ASSERT3U(1 << DB_DNODE(db)->dn_indblkshift, ==, - db->db.db_size); - /* - * We want to verify that all the blkptrs in the - * indirect block are holes, but we may have - * automatically set up a few fields for them. - * We iterate through each blkptr and verify - * they only have those fields set. - */ - for (int i = 0; - i < db->db.db_size / sizeof (blkptr_t); - i++) { - blkptr_t *bp = &bps[i]; - ASSERT(ZIO_CHECKSUM_IS_ZERO( - &bp->blk_cksum)); - ASSERT( - DVA_IS_EMPTY(&bp->blk_dva[0]) && - DVA_IS_EMPTY(&bp->blk_dva[1]) && - DVA_IS_EMPTY(&bp->blk_dva[2])); - ASSERT0(bp->blk_fill); - ASSERT0(bp->blk_pad[0]); - ASSERT0(bp->blk_pad[1]); - ASSERT(!BP_IS_EMBEDDED(bp)); - ASSERT(BP_IS_HOLE(bp)); - ASSERT0(bp->blk_phys_birth); - } - } - } - } - DB_DNODE_EXIT(db); -} -#endif - -static void -dbuf_clear_data(dmu_buf_impl_t *db) -{ - ASSERT(MUTEX_HELD(&db->db_mtx)); - dbuf_evict_user(db); - ASSERT3P(db->db_buf, ==, NULL); - db->db.db_data = NULL; - if (db->db_state != DB_NOFILL) - db->db_state = DB_UNCACHED; -} - -static void -dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) -{ - ASSERT(MUTEX_HELD(&db->db_mtx)); - ASSERT(buf != NULL); - - db->db_buf = buf; - ASSERT(buf->b_data != NULL); - db->db.db_data = buf->b_data; -} - -/* - * Loan out an arc_buf for read. Return the loaned arc_buf. 
- */ -arc_buf_t * -dbuf_loan_arcbuf(dmu_buf_impl_t *db) -{ - arc_buf_t *abuf; - - ASSERT(db->db_blkid != DMU_BONUS_BLKID); - mutex_enter(&db->db_mtx); - if (arc_released(db->db_buf) || zfs_refcount_count(&db->db_holds) > 1) { - int blksz = db->db.db_size; - spa_t *spa = db->db_objset->os_spa; - - mutex_exit(&db->db_mtx); - abuf = arc_loan_buf(spa, B_FALSE, blksz); - bcopy(db->db.db_data, abuf->b_data, blksz); - } else { - abuf = db->db_buf; - arc_loan_inuse_buf(abuf, db); - db->db_buf = NULL; - dbuf_clear_data(db); - mutex_exit(&db->db_mtx); - } - return (abuf); -} - -/* - * Calculate which level n block references the data at the level 0 offset - * provided. - */ -uint64_t -dbuf_whichblock(dnode_t *dn, int64_t level, uint64_t offset) -{ - if (dn->dn_datablkshift != 0 && dn->dn_indblkshift != 0) { - /* - * The level n blkid is equal to the level 0 blkid divided by - * the number of level 0s in a level n block. - * - * The level 0 blkid is offset >> datablkshift = - * offset / 2^datablkshift. - * - * The number of level 0s in a level n is the number of block - * pointers in an indirect block, raised to the power of level. - * This is 2^(indblkshift - SPA_BLKPTRSHIFT)^level = - * 2^(level*(indblkshift - SPA_BLKPTRSHIFT)). - * - * Thus, the level n blkid is: offset / - * ((2^datablkshift)*(2^(level*(indblkshift - SPA_BLKPTRSHIFT))) - * = offset / 2^(datablkshift + level * - * (indblkshift - SPA_BLKPTRSHIFT)) - * = offset >> (datablkshift + level * - * (indblkshift - SPA_BLKPTRSHIFT)) - */ - return (offset >> (dn->dn_datablkshift + level * - (dn->dn_indblkshift - SPA_BLKPTRSHIFT))); - } else { - ASSERT3U(offset, <, dn->dn_datablksz); - return (0); - } -} - -static void -dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, - arc_buf_t *buf, void *vdb) -{ - dmu_buf_impl_t *db = vdb; - - mutex_enter(&db->db_mtx); - ASSERT3U(db->db_state, ==, DB_READ); - /* - * All reads are synchronous, so we must have a hold on the dbuf - */ - ASSERT(zfs_refcount_count(&db->db_holds) > 0); - ASSERT(db->db_buf == NULL); - ASSERT(db->db.db_data == NULL); - if (buf == NULL) { - /* i/o error */ - ASSERT(zio == NULL || zio->io_error != 0); - ASSERT(db->db_blkid != DMU_BONUS_BLKID); - ASSERT3P(db->db_buf, ==, NULL); - db->db_state = DB_UNCACHED; - } else if (db->db_level == 0 && db->db_freed_in_flight) { - /* freed in flight */ - ASSERT(zio == NULL || zio->io_error == 0); - if (buf == NULL) { - buf = arc_alloc_buf(db->db_objset->os_spa, - db, DBUF_GET_BUFC_TYPE(db), db->db.db_size); - } - arc_release(buf, db); - bzero(buf->b_data, db->db.db_size); - arc_buf_freeze(buf); - db->db_freed_in_flight = FALSE; - dbuf_set_data(db, buf); - db->db_state = DB_CACHED; - } else { - /* success */ - ASSERT(zio == NULL || zio->io_error == 0); - dbuf_set_data(db, buf); - db->db_state = DB_CACHED; - } - cv_broadcast(&db->db_changed); - dbuf_rele_and_unlock(db, NULL, B_FALSE); -} - -static void -dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) -{ - dnode_t *dn; - zbookmark_phys_t zb; - arc_flags_t aflags = ARC_FLAG_NOWAIT; - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - ASSERT(!zfs_refcount_is_zero(&db->db_holds)); - /* We need the struct_rwlock to prevent db_blkptr from changing. */ - ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); - ASSERT(MUTEX_HELD(&db->db_mtx)); - ASSERT(db->db_state == DB_UNCACHED); - ASSERT(db->db_buf == NULL); - - if (db->db_blkid == DMU_BONUS_BLKID) { - /* - * The bonus length stored in the dnode may be less than - * the maximum available space in the bonus buffer. 
- */ - int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen); - int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots); - - ASSERT3U(bonuslen, <=, db->db.db_size); - db->db.db_data = zio_buf_alloc(max_bonuslen); - arc_space_consume(max_bonuslen, ARC_SPACE_BONUS); - if (bonuslen < max_bonuslen) - bzero(db->db.db_data, max_bonuslen); - if (bonuslen) - bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen); - DB_DNODE_EXIT(db); - db->db_state = DB_CACHED; - mutex_exit(&db->db_mtx); - return; - } - - /* - * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync() - * processes the delete record and clears the bp while we are waiting - * for the dn_mtx (resulting in a "no" from block_freed). - */ - if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) || - (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) || - BP_IS_HOLE(db->db_blkptr)))) { - arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - - dbuf_set_data(db, arc_alloc_buf(db->db_objset->os_spa, db, type, - db->db.db_size)); - bzero(db->db.db_data, db->db.db_size); - - if (db->db_blkptr != NULL && db->db_level > 0 && - BP_IS_HOLE(db->db_blkptr) && - db->db_blkptr->blk_birth != 0) { - blkptr_t *bps = db->db.db_data; - for (int i = 0; i < ((1 << - DB_DNODE(db)->dn_indblkshift) / sizeof (blkptr_t)); - i++) { - blkptr_t *bp = &bps[i]; - ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, - 1 << dn->dn_indblkshift); - BP_SET_LSIZE(bp, - BP_GET_LEVEL(db->db_blkptr) == 1 ? - dn->dn_datablksz : - BP_GET_LSIZE(db->db_blkptr)); - BP_SET_TYPE(bp, BP_GET_TYPE(db->db_blkptr)); - BP_SET_LEVEL(bp, - BP_GET_LEVEL(db->db_blkptr) - 1); - BP_SET_BIRTH(bp, db->db_blkptr->blk_birth, 0); - } - } - DB_DNODE_EXIT(db); - db->db_state = DB_CACHED; - mutex_exit(&db->db_mtx); - return; - } - - DB_DNODE_EXIT(db); - - db->db_state = DB_READ; - mutex_exit(&db->db_mtx); - - if (DBUF_IS_L2CACHEABLE(db)) - aflags |= ARC_FLAG_L2CACHE; - - SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ? - db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET, - db->db.db_object, db->db_level, db->db_blkid); - - dbuf_add_ref(db, NULL); - - (void) arc_read(zio, db->db_objset->os_spa, db->db_blkptr, - dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, - (flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED, - &aflags, &zb); -} - -/* - * This is our just-in-time copy function. It makes a copy of buffers that - * have been modified in a previous transaction group before we access them in - * the current active group. - * - * This function is used in three places: when we are dirtying a buffer for the - * first time in a txg, when we are freeing a range in a dnode that includes - * this buffer, and when we are accessing a buffer which was received compressed - * and later referenced in a WRITE_BYREF record. - * - * Note that when we are called from dbuf_free_range() we do not put a hold on - * the buffer, we just traverse the active dbuf list for the dnode. - */ -static void -dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) -{ - dbuf_dirty_record_t *dr = db->db_last_dirty; - - ASSERT(MUTEX_HELD(&db->db_mtx)); - ASSERT(db->db.db_data != NULL); - ASSERT(db->db_level == 0); - ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT); - - if (dr == NULL || - (dr->dt.dl.dr_data != - ((db->db_blkid == DMU_BONUS_BLKID) ? 
db->db.db_data : db->db_buf))) - return; - - /* - * If the last dirty record for this dbuf has not yet synced - * and its referencing the dbuf data, either: - * reset the reference to point to a new copy, - * or (if there a no active holders) - * just null out the current db_data pointer. - */ - ASSERT(dr->dr_txg >= txg - 2); - if (db->db_blkid == DMU_BONUS_BLKID) { - /* Note that the data bufs here are zio_bufs */ - dnode_t *dn = DB_DNODE(db); - int bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots); - dr->dt.dl.dr_data = zio_buf_alloc(bonuslen); - arc_space_consume(bonuslen, ARC_SPACE_BONUS); - bcopy(db->db.db_data, dr->dt.dl.dr_data, bonuslen); - } else if (zfs_refcount_count(&db->db_holds) > db->db_dirtycnt) { - int size = arc_buf_size(db->db_buf); - arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - spa_t *spa = db->db_objset->os_spa; - enum zio_compress compress_type = - arc_get_compression(db->db_buf); - - if (compress_type == ZIO_COMPRESS_OFF) { - dr->dt.dl.dr_data = arc_alloc_buf(spa, db, type, size); - } else { - ASSERT3U(type, ==, ARC_BUFC_DATA); - dr->dt.dl.dr_data = arc_alloc_compressed_buf(spa, db, - size, arc_buf_lsize(db->db_buf), compress_type); - } - bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size); - } else { - db->db_buf = NULL; - dbuf_clear_data(db); - } -} - -int -dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) -{ - int err = 0; - boolean_t prefetch; - dnode_t *dn; - - /* - * We don't have to hold the mutex to check db_state because it - * can't be freed while we have a hold on the buffer. - */ - ASSERT(!zfs_refcount_is_zero(&db->db_holds)); - - if (db->db_state == DB_NOFILL) - return (SET_ERROR(EIO)); - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - if ((flags & DB_RF_HAVESTRUCT) == 0) - rw_enter(&dn->dn_struct_rwlock, RW_READER); - - prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && - (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL && - DBUF_IS_CACHEABLE(db); - - mutex_enter(&db->db_mtx); - if (db->db_state == DB_CACHED) { - /* - * If the arc buf is compressed, we need to decompress it to - * read the data. This could happen during the "zfs receive" of - * a stream which is compressed and deduplicated. - */ - if (db->db_buf != NULL && - arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF) { - dbuf_fix_old_data(db, - spa_syncing_txg(dmu_objset_spa(db->db_objset))); - err = arc_decompress(db->db_buf); - dbuf_set_data(db, db->db_buf); - } - mutex_exit(&db->db_mtx); - if (prefetch) - dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE); - if ((flags & DB_RF_HAVESTRUCT) == 0) - rw_exit(&dn->dn_struct_rwlock); - DB_DNODE_EXIT(db); - DBUF_STAT_BUMP(hash_hits); - } else if (db->db_state == DB_UNCACHED) { - spa_t *spa = dn->dn_objset->os_spa; - boolean_t need_wait = B_FALSE; - - if (zio == NULL && - db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) { - zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); - need_wait = B_TRUE; - } - dbuf_read_impl(db, zio, flags); - - /* dbuf_read_impl has dropped db_mtx for us */ - - if (prefetch) - dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE); - - if ((flags & DB_RF_HAVESTRUCT) == 0) - rw_exit(&dn->dn_struct_rwlock); - DB_DNODE_EXIT(db); - DBUF_STAT_BUMP(hash_misses); - - if (need_wait) - err = zio_wait(zio); - } else { - /* - * Another reader came in while the dbuf was in flight - * between UNCACHED and CACHED. Either a writer will finish - * writing the buffer (sending the dbuf to CACHED) or the - * first reader's request will reach the read_done callback - * and send the dbuf to CACHED. 
Otherwise, a failure - * occurred and the dbuf went to UNCACHED. - */ - mutex_exit(&db->db_mtx); - if (prefetch) - dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE); - if ((flags & DB_RF_HAVESTRUCT) == 0) - rw_exit(&dn->dn_struct_rwlock); - DB_DNODE_EXIT(db); - DBUF_STAT_BUMP(hash_misses); - - /* Skip the wait per the caller's request. */ - mutex_enter(&db->db_mtx); - if ((flags & DB_RF_NEVERWAIT) == 0) { - while (db->db_state == DB_READ || - db->db_state == DB_FILL) { - ASSERT(db->db_state == DB_READ || - (flags & DB_RF_HAVESTRUCT) == 0); - DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *, - db, zio_t *, zio); - cv_wait(&db->db_changed, &db->db_mtx); - } - if (db->db_state == DB_UNCACHED) - err = SET_ERROR(EIO); - } - mutex_exit(&db->db_mtx); - } - - return (err); -} - -static void -dbuf_noread(dmu_buf_impl_t *db) -{ - ASSERT(!zfs_refcount_is_zero(&db->db_holds)); - ASSERT(db->db_blkid != DMU_BONUS_BLKID); - mutex_enter(&db->db_mtx); - while (db->db_state == DB_READ || db->db_state == DB_FILL) - cv_wait(&db->db_changed, &db->db_mtx); - if (db->db_state == DB_UNCACHED) { - arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - spa_t *spa = db->db_objset->os_spa; - - ASSERT(db->db_buf == NULL); - ASSERT(db->db.db_data == NULL); - dbuf_set_data(db, arc_alloc_buf(spa, db, type, db->db.db_size)); - db->db_state = DB_FILL; - } else if (db->db_state == DB_NOFILL) { - dbuf_clear_data(db); - } else { - ASSERT3U(db->db_state, ==, DB_CACHED); - } - mutex_exit(&db->db_mtx); -} - -void -dbuf_unoverride(dbuf_dirty_record_t *dr) -{ - dmu_buf_impl_t *db = dr->dr_dbuf; - blkptr_t *bp = &dr->dt.dl.dr_overridden_by; - uint64_t txg = dr->dr_txg; - - ASSERT(MUTEX_HELD(&db->db_mtx)); - /* - * This assert is valid because dmu_sync() expects to be called by - * a zilog's get_data while holding a range lock. This call only - * comes from dbuf_dirty() callers who must also hold a range lock. - */ - ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC); - ASSERT(db->db_level == 0); - - if (db->db_blkid == DMU_BONUS_BLKID || - dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN) - return; - - ASSERT(db->db_data_pending != dr); - - /* free this block */ - if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite) - zio_free(db->db_objset->os_spa, txg, bp); - - dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; - dr->dt.dl.dr_nopwrite = B_FALSE; - - /* - * Release the already-written buffer, so we leave it in - * a consistent dirty state. Note that all callers are - * modifying the buffer, so they will immediately do - * another (redundant) arc_release(). Therefore, leave - * the buf thawed to save the effort of freezing & - * immediately re-thawing it. - */ - arc_release(dr->dt.dl.dr_data, db); -} - -/* - * Evict (if its unreferenced) or clear (if its referenced) any level-0 - * data blocks in the free range, so that any future readers will find - * empty blocks. 
- */ -void -dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, - dmu_tx_t *tx) -{ - dmu_buf_impl_t db_search; - dmu_buf_impl_t *db, *db_next; - uint64_t txg = tx->tx_txg; - avl_index_t where; - - if (end_blkid > dn->dn_maxblkid && - !(start_blkid == DMU_SPILL_BLKID || end_blkid == DMU_SPILL_BLKID)) - end_blkid = dn->dn_maxblkid; - dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid); - - db_search.db_level = 0; - db_search.db_blkid = start_blkid; - db_search.db_state = DB_SEARCH; - - mutex_enter(&dn->dn_dbufs_mtx); - db = avl_find(&dn->dn_dbufs, &db_search, &where); - ASSERT3P(db, ==, NULL); - - db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); - - for (; db != NULL; db = db_next) { - db_next = AVL_NEXT(&dn->dn_dbufs, db); - ASSERT(db->db_blkid != DMU_BONUS_BLKID); - - if (db->db_level != 0 || db->db_blkid > end_blkid) { - break; - } - ASSERT3U(db->db_blkid, >=, start_blkid); - - /* found a level 0 buffer in the range */ - mutex_enter(&db->db_mtx); - if (dbuf_undirty(db, tx)) { - /* mutex has been dropped and dbuf destroyed */ - continue; - } - - if (db->db_state == DB_UNCACHED || - db->db_state == DB_NOFILL || - db->db_state == DB_EVICTING) { - ASSERT(db->db.db_data == NULL); - mutex_exit(&db->db_mtx); - continue; - } - if (db->db_state == DB_READ || db->db_state == DB_FILL) { - /* will be handled in dbuf_read_done or dbuf_rele */ - db->db_freed_in_flight = TRUE; - mutex_exit(&db->db_mtx); - continue; - } - if (zfs_refcount_count(&db->db_holds) == 0) { - ASSERT(db->db_buf); - dbuf_destroy(db); - continue; - } - /* The dbuf is referenced */ - - if (db->db_last_dirty != NULL) { - dbuf_dirty_record_t *dr = db->db_last_dirty; - - if (dr->dr_txg == txg) { - /* - * This buffer is "in-use", re-adjust the file - * size to reflect that this buffer may - * contain new data when we sync. - */ - if (db->db_blkid != DMU_SPILL_BLKID && - db->db_blkid > dn->dn_maxblkid) - dn->dn_maxblkid = db->db_blkid; - dbuf_unoverride(dr); - } else { - /* - * This dbuf is not dirty in the open context. - * Either uncache it (if its not referenced in - * the open context) or reset its contents to - * empty. - */ - dbuf_fix_old_data(db, txg); - } - } - /* clear the contents if its cached */ - if (db->db_state == DB_CACHED) { - ASSERT(db->db.db_data != NULL); - arc_release(db->db_buf, db); - bzero(db->db.db_data, db->db.db_size); - arc_buf_freeze(db->db_buf); - } - - mutex_exit(&db->db_mtx); - } - mutex_exit(&dn->dn_dbufs_mtx); -} - -void -dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) -{ - arc_buf_t *buf, *obuf; - int osize = db->db.db_size; - arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - dnode_t *dn; - - ASSERT(db->db_blkid != DMU_BONUS_BLKID); - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - - /* XXX does *this* func really need the lock? */ - ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); - - /* - * This call to dmu_buf_will_dirty() with the dn_struct_rwlock held - * is OK, because there can be no other references to the db - * when we are changing its size, so no concurrent DB_FILL can - * be happening. 
- */ - /* - * XXX we should be doing a dbuf_read, checking the return - * value and returning that up to our callers - */ - dmu_buf_will_dirty(&db->db, tx); - - /* create the data buffer for the new block */ - buf = arc_alloc_buf(dn->dn_objset->os_spa, db, type, size); - - /* copy old block data to the new block */ - obuf = db->db_buf; - bcopy(obuf->b_data, buf->b_data, MIN(osize, size)); - /* zero the remainder */ - if (size > osize) - bzero((uint8_t *)buf->b_data + osize, size - osize); - - mutex_enter(&db->db_mtx); - dbuf_set_data(db, buf); - arc_buf_destroy(obuf, db); - db->db.db_size = size; - - if (db->db_level == 0) { - ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg); - db->db_last_dirty->dt.dl.dr_data = buf; - } - mutex_exit(&db->db_mtx); - - dmu_objset_willuse_space(dn->dn_objset, size - osize, tx); - DB_DNODE_EXIT(db); -} - -void -dbuf_release_bp(dmu_buf_impl_t *db) -{ - objset_t *os = db->db_objset; - - ASSERT(dsl_pool_sync_context(dmu_objset_pool(os))); - ASSERT(arc_released(os->os_phys_buf) || - list_link_active(&os->os_dsl_dataset->ds_synced_link)); - ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf)); - - (void) arc_release(db->db_buf, db); -} - -/* - * We already have a dirty record for this TXG, and we are being - * dirtied again. - */ -static void -dbuf_redirty(dbuf_dirty_record_t *dr) -{ - dmu_buf_impl_t *db = dr->dr_dbuf; - - ASSERT(MUTEX_HELD(&db->db_mtx)); - - if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) { - /* - * If this buffer has already been written out, - * we now need to reset its state. - */ - dbuf_unoverride(dr); - if (db->db.db_object != DMU_META_DNODE_OBJECT && - db->db_state != DB_NOFILL) { - /* Already released on initial dirty, so just thaw. */ - ASSERT(arc_released(db->db_buf)); - arc_buf_thaw(db->db_buf); - } - } -} - -dbuf_dirty_record_t * -dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) -{ - dnode_t *dn; - objset_t *os; - dbuf_dirty_record_t **drp, *dr; - int drop_struct_lock = FALSE; - int txgoff = tx->tx_txg & TXG_MASK; - - ASSERT(tx->tx_txg != 0); - ASSERT(!zfs_refcount_is_zero(&db->db_holds)); - DMU_TX_DIRTY_BUF(tx, db); - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - /* - * Shouldn't dirty a regular buffer in syncing context. Private - * objects may be dirtied in syncing context, but only if they - * were already pre-dirtied in open context. - */ -#ifdef DEBUG - if (dn->dn_objset->os_dsl_dataset != NULL) { - rrw_enter(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, - RW_READER, FTAG); - } - ASSERT(!dmu_tx_is_syncing(tx) || - BP_IS_HOLE(dn->dn_objset->os_rootbp) || - DMU_OBJECT_IS_SPECIAL(dn->dn_object) || - dn->dn_objset->os_dsl_dataset == NULL); - if (dn->dn_objset->os_dsl_dataset != NULL) - rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, FTAG); -#endif - /* - * We make this assert for private objects as well, but after we - * check if we're already dirty. They are allowed to re-dirty - * in syncing context. - */ - ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || - dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == - (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); - - mutex_enter(&db->db_mtx); - /* - * XXX make this true for indirects too? The problem is that - * transactions created with dmu_tx_create_assigned() from - * syncing context don't bother holding ahead. 
- */ - ASSERT(db->db_level != 0 || - db->db_state == DB_CACHED || db->db_state == DB_FILL || - db->db_state == DB_NOFILL); - - mutex_enter(&dn->dn_mtx); - /* - * Don't set dirtyctx to SYNC if we're just modifying this as we - * initialize the objset. - */ - if (dn->dn_dirtyctx == DN_UNDIRTIED) { - if (dn->dn_objset->os_dsl_dataset != NULL) { - rrw_enter(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, - RW_READER, FTAG); - } - if (!BP_IS_HOLE(dn->dn_objset->os_rootbp)) { - dn->dn_dirtyctx = (dmu_tx_is_syncing(tx) ? - DN_DIRTY_SYNC : DN_DIRTY_OPEN); - ASSERT(dn->dn_dirtyctx_firstset == NULL); - dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP); - } - if (dn->dn_objset->os_dsl_dataset != NULL) { - rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, - FTAG); - } - } - - if (tx->tx_txg > dn->dn_dirty_txg) - dn->dn_dirty_txg = tx->tx_txg; - mutex_exit(&dn->dn_mtx); - - if (db->db_blkid == DMU_SPILL_BLKID) - dn->dn_have_spill = B_TRUE; - - /* - * If this buffer is already dirty, we're done. - */ - drp = &db->db_last_dirty; - ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg || - db->db.db_object == DMU_META_DNODE_OBJECT); - while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg) - drp = &dr->dr_next; - if (dr && dr->dr_txg == tx->tx_txg) { - DB_DNODE_EXIT(db); - - dbuf_redirty(dr); - mutex_exit(&db->db_mtx); - return (dr); - } - - /* - * Only valid if not already dirty. - */ - ASSERT(dn->dn_object == 0 || - dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == - (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); - - ASSERT3U(dn->dn_nlevels, >, db->db_level); - - /* - * We should only be dirtying in syncing context if it's the - * mos or we're initializing the os or it's a special object. - * However, we are allowed to dirty in syncing context provided - * we already dirtied it in open context. Hence we must make - * this assertion only if we're not already dirty. - */ - os = dn->dn_objset; - VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(os->os_spa)); -#ifdef DEBUG - if (dn->dn_objset->os_dsl_dataset != NULL) - rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_READER, FTAG); - ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) || - os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp)); - if (dn->dn_objset->os_dsl_dataset != NULL) - rrw_exit(&os->os_dsl_dataset->ds_bp_rwlock, FTAG); -#endif - ASSERT(db->db.db_size != 0); - - dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); - - if (db->db_blkid != DMU_BONUS_BLKID) { - dmu_objset_willuse_space(os, db->db.db_size, tx); - } - - /* - * If this buffer is dirty in an old transaction group we need - * to make a copy of it so that the changes we make in this - * transaction group won't leak out when we sync the older txg. - */ - dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP); - list_link_init(&dr->dr_dirty_node); - if (db->db_level == 0) { - void *data_old = db->db_buf; - - if (db->db_state != DB_NOFILL) { - if (db->db_blkid == DMU_BONUS_BLKID) { - dbuf_fix_old_data(db, tx->tx_txg); - data_old = db->db.db_data; - } else if (db->db.db_object != DMU_META_DNODE_OBJECT) { - /* - * Release the data buffer from the cache so - * that we can modify it without impacting - * possible other users of this cached data - * block. Note that indirect blocks and - * private objects are not released until the - * syncing state (since they are only modified - * then). 
- */ - arc_release(db->db_buf, db); - dbuf_fix_old_data(db, tx->tx_txg); - data_old = db->db_buf; - } - ASSERT(data_old != NULL); - } - dr->dt.dl.dr_data = data_old; - } else { - mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL); - list_create(&dr->dt.di.dr_children, - sizeof (dbuf_dirty_record_t), - offsetof(dbuf_dirty_record_t, dr_dirty_node)); - } - if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL) - dr->dr_accounted = db->db.db_size; - dr->dr_dbuf = db; - dr->dr_txg = tx->tx_txg; - dr->dr_next = *drp; - *drp = dr; - - /* - * We could have been freed_in_flight between the dbuf_noread - * and dbuf_dirty. We win, as though the dbuf_noread() had - * happened after the free. - */ - if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && - db->db_blkid != DMU_SPILL_BLKID) { - mutex_enter(&dn->dn_mtx); - if (dn->dn_free_ranges[txgoff] != NULL) { - range_tree_clear(dn->dn_free_ranges[txgoff], - db->db_blkid, 1); - } - mutex_exit(&dn->dn_mtx); - db->db_freed_in_flight = FALSE; - } - - /* - * This buffer is now part of this txg - */ - dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg); - db->db_dirtycnt += 1; - ASSERT3U(db->db_dirtycnt, <=, 3); - - mutex_exit(&db->db_mtx); - - if (db->db_blkid == DMU_BONUS_BLKID || - db->db_blkid == DMU_SPILL_BLKID) { - mutex_enter(&dn->dn_mtx); - ASSERT(!list_link_active(&dr->dr_dirty_node)); - list_insert_tail(&dn->dn_dirty_records[txgoff], dr); - mutex_exit(&dn->dn_mtx); - dnode_setdirty(dn, tx); - DB_DNODE_EXIT(db); - return (dr); - } - - /* - * The dn_struct_rwlock prevents db_blkptr from changing - * due to a write from syncing context completing - * while we are running, so we want to acquire it before - * looking at db_blkptr. - */ - if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { - rw_enter(&dn->dn_struct_rwlock, RW_READER); - drop_struct_lock = TRUE; - } - - /* - * We need to hold the dn_struct_rwlock to make this assertion, - * because it protects dn_phys / dn_next_nlevels from changing. - */ - ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) || - dn->dn_phys->dn_nlevels > db->db_level || - dn->dn_next_nlevels[txgoff] > db->db_level || - dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level || - dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level); - - /* - * If we are overwriting a dedup BP, then unless it is snapshotted, - * when we get to syncing context we will need to decrement its - * refcount in the DDT. Prefetch the relevant DDT block so that - * syncing context won't have to wait for the i/o. - */ - ddt_prefetch(os->os_spa, db->db_blkptr); - - if (db->db_level == 0) { - dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock); - ASSERT(dn->dn_maxblkid >= db->db_blkid); - } - - if (db->db_level+1 < dn->dn_nlevels) { - dmu_buf_impl_t *parent = db->db_parent; - dbuf_dirty_record_t *di; - int parent_held = FALSE; - - if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) { - int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; - - parent = dbuf_hold_level(dn, db->db_level+1, - db->db_blkid >> epbs, FTAG); - ASSERT(parent != NULL); - parent_held = TRUE; - } - if (drop_struct_lock) - rw_exit(&dn->dn_struct_rwlock); - ASSERT3U(db->db_level+1, ==, parent->db_level); - di = dbuf_dirty(parent, tx); - if (parent_held) - dbuf_rele(parent, FTAG); - - mutex_enter(&db->db_mtx); - /* - * Since we've dropped the mutex, it's possible that - * dbuf_undirty() might have changed this out from under us. 
- */ - if (db->db_last_dirty == dr || - dn->dn_object == DMU_META_DNODE_OBJECT) { - mutex_enter(&di->dt.di.dr_mtx); - ASSERT3U(di->dr_txg, ==, tx->tx_txg); - ASSERT(!list_link_active(&dr->dr_dirty_node)); - list_insert_tail(&di->dt.di.dr_children, dr); - mutex_exit(&di->dt.di.dr_mtx); - dr->dr_parent = di; - } - mutex_exit(&db->db_mtx); - } else { - ASSERT(db->db_level+1 == dn->dn_nlevels); - ASSERT(db->db_blkid < dn->dn_nblkptr); - ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf); - mutex_enter(&dn->dn_mtx); - ASSERT(!list_link_active(&dr->dr_dirty_node)); - list_insert_tail(&dn->dn_dirty_records[txgoff], dr); - mutex_exit(&dn->dn_mtx); - if (drop_struct_lock) - rw_exit(&dn->dn_struct_rwlock); - } - - dnode_setdirty(dn, tx); - DB_DNODE_EXIT(db); - return (dr); -} - -/* - * Undirty a buffer in the transaction group referenced by the given - * transaction. Return whether this evicted the dbuf. - */ -static boolean_t -dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) -{ - dnode_t *dn; - uint64_t txg = tx->tx_txg; - dbuf_dirty_record_t *dr, **drp; - - ASSERT(txg != 0); - - /* - * Due to our use of dn_nlevels below, this can only be called - * in open context, unless we are operating on the MOS. - * From syncing context, dn_nlevels may be different from the - * dn_nlevels used when dbuf was dirtied. - */ - ASSERT(db->db_objset == - dmu_objset_pool(db->db_objset)->dp_meta_objset || - txg != spa_syncing_txg(dmu_objset_spa(db->db_objset))); - ASSERT(db->db_blkid != DMU_BONUS_BLKID); - ASSERT0(db->db_level); - ASSERT(MUTEX_HELD(&db->db_mtx)); - - /* - * If this buffer is not dirty, we're done. - */ - for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next) - if (dr->dr_txg <= txg) - break; - if (dr == NULL || dr->dr_txg < txg) - return (B_FALSE); - ASSERT(dr->dr_txg == txg); - ASSERT(dr->dr_dbuf == db); - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - - dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); - - ASSERT(db->db.db_size != 0); - - dsl_pool_undirty_space(dmu_objset_pool(dn->dn_objset), - dr->dr_accounted, txg); - - *drp = dr->dr_next; - - /* - * Note that there are three places in dbuf_dirty() - * where this dirty record may be put on a list. - * Make sure to do a list_remove corresponding to - * every one of those list_insert calls. 
- */ - if (dr->dr_parent) { - mutex_enter(&dr->dr_parent->dt.di.dr_mtx); - list_remove(&dr->dr_parent->dt.di.dr_children, dr); - mutex_exit(&dr->dr_parent->dt.di.dr_mtx); - } else if (db->db_blkid == DMU_SPILL_BLKID || - db->db_level + 1 == dn->dn_nlevels) { - ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf); - mutex_enter(&dn->dn_mtx); - list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr); - mutex_exit(&dn->dn_mtx); - } - DB_DNODE_EXIT(db); - - if (db->db_state != DB_NOFILL) { - dbuf_unoverride(dr); - - ASSERT(db->db_buf != NULL); - ASSERT(dr->dt.dl.dr_data != NULL); - if (dr->dt.dl.dr_data != db->db_buf) - arc_buf_destroy(dr->dt.dl.dr_data, db); - } - - kmem_free(dr, sizeof (dbuf_dirty_record_t)); - - ASSERT(db->db_dirtycnt > 0); - db->db_dirtycnt -= 1; - - if (zfs_refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) { - ASSERT(db->db_state == DB_NOFILL || arc_released(db->db_buf)); - dbuf_destroy(db); - return (B_TRUE); - } - - return (B_FALSE); -} - -void -dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH; - - ASSERT(tx->tx_txg != 0); - ASSERT(!zfs_refcount_is_zero(&db->db_holds)); - - /* - * Quick check for dirtyness. For already dirty blocks, this - * reduces runtime of this function by >90%, and overall performance - * by 50% for some workloads (e.g. file deletion with indirect blocks - * cached). - */ - mutex_enter(&db->db_mtx); - dbuf_dirty_record_t *dr; - for (dr = db->db_last_dirty; - dr != NULL && dr->dr_txg >= tx->tx_txg; dr = dr->dr_next) { - /* - * It's possible that it is already dirty but not cached, - * because there are some calls to dbuf_dirty() that don't - * go through dmu_buf_will_dirty(). - */ - if (dr->dr_txg == tx->tx_txg && db->db_state == DB_CACHED) { - /* This dbuf is already dirty and cached. */ - dbuf_redirty(dr); - mutex_exit(&db->db_mtx); - return; - } - } - mutex_exit(&db->db_mtx); - - DB_DNODE_ENTER(db); - if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock)) - rf |= DB_RF_HAVESTRUCT; - DB_DNODE_EXIT(db); - (void) dbuf_read(db, NULL, rf); - (void) dbuf_dirty(db, tx); -} - -void -dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - - db->db_state = DB_NOFILL; - - dmu_buf_will_fill(db_fake, tx); -} - -void -dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - - ASSERT(db->db_blkid != DMU_BONUS_BLKID); - ASSERT(tx->tx_txg != 0); - ASSERT(db->db_level == 0); - ASSERT(!zfs_refcount_is_zero(&db->db_holds)); - - ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT || - dmu_tx_private_ok(tx)); - - dbuf_noread(db); - (void) dbuf_dirty(db, tx); -} - -#pragma weak dmu_buf_fill_done = dbuf_fill_done -/* ARGSUSED */ -void -dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx) -{ - mutex_enter(&db->db_mtx); - DBUF_VERIFY(db); - - if (db->db_state == DB_FILL) { - if (db->db_level == 0 && db->db_freed_in_flight) { - ASSERT(db->db_blkid != DMU_BONUS_BLKID); - /* we were freed while filling */ - /* XXX dbuf_undirty? 
*/ - bzero(db->db.db_data, db->db.db_size); - db->db_freed_in_flight = FALSE; - } - db->db_state = DB_CACHED; - cv_broadcast(&db->db_changed); - } - mutex_exit(&db->db_mtx); -} - -void -dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data, - bp_embedded_type_t etype, enum zio_compress comp, - int uncompressed_size, int compressed_size, int byteorder, - dmu_tx_t *tx) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; - struct dirty_leaf *dl; - dmu_object_type_t type; - - if (etype == BP_EMBEDDED_TYPE_DATA) { - ASSERT(spa_feature_is_active(dmu_objset_spa(db->db_objset), - SPA_FEATURE_EMBEDDED_DATA)); - } - - DB_DNODE_ENTER(db); - type = DB_DNODE(db)->dn_type; - DB_DNODE_EXIT(db); - - ASSERT0(db->db_level); - ASSERT(db->db_blkid != DMU_BONUS_BLKID); - - dmu_buf_will_not_fill(dbuf, tx); - - ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg); - dl = &db->db_last_dirty->dt.dl; - encode_embedded_bp_compressed(&dl->dr_overridden_by, - data, comp, uncompressed_size, compressed_size); - BPE_SET_ETYPE(&dl->dr_overridden_by, etype); - BP_SET_TYPE(&dl->dr_overridden_by, type); - BP_SET_LEVEL(&dl->dr_overridden_by, 0); - BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder); - - dl->dr_override_state = DR_OVERRIDDEN; - dl->dr_overridden_by.blk_birth = db->db_last_dirty->dr_txg; -} - -/* - * Directly assign a provided arc buf to a given dbuf if it's not referenced - * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf. - */ -void -dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) -{ - ASSERT(!zfs_refcount_is_zero(&db->db_holds)); - ASSERT(db->db_blkid != DMU_BONUS_BLKID); - ASSERT(db->db_level == 0); - ASSERT3U(dbuf_is_metadata(db), ==, arc_is_metadata(buf)); - ASSERT(buf != NULL); - ASSERT(arc_buf_lsize(buf) == db->db.db_size); - ASSERT(tx->tx_txg != 0); - - arc_return_buf(buf, db); - ASSERT(arc_released(buf)); - - mutex_enter(&db->db_mtx); - - while (db->db_state == DB_READ || db->db_state == DB_FILL) - cv_wait(&db->db_changed, &db->db_mtx); - - ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED); - - if (db->db_state == DB_CACHED && - zfs_refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) { - mutex_exit(&db->db_mtx); - (void) dbuf_dirty(db, tx); - bcopy(buf->b_data, db->db.db_data, db->db.db_size); - arc_buf_destroy(buf, db); - xuio_stat_wbuf_copied(); - return; - } - - xuio_stat_wbuf_nocopy(); - if (db->db_state == DB_CACHED) { - dbuf_dirty_record_t *dr = db->db_last_dirty; - - ASSERT(db->db_buf != NULL); - if (dr != NULL && dr->dr_txg == tx->tx_txg) { - ASSERT(dr->dt.dl.dr_data == db->db_buf); - if (!arc_released(db->db_buf)) { - ASSERT(dr->dt.dl.dr_override_state == - DR_OVERRIDDEN); - arc_release(db->db_buf, db); - } - dr->dt.dl.dr_data = buf; - arc_buf_destroy(db->db_buf, db); - } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) { - arc_release(db->db_buf, db); - arc_buf_destroy(db->db_buf, db); - } - db->db_buf = NULL; - } - ASSERT(db->db_buf == NULL); - dbuf_set_data(db, buf); - db->db_state = DB_FILL; - mutex_exit(&db->db_mtx); - (void) dbuf_dirty(db, tx); - dmu_buf_fill_done(&db->db, tx); -} - -void -dbuf_destroy(dmu_buf_impl_t *db) -{ - dnode_t *dn; - dmu_buf_impl_t *parent = db->db_parent; - dmu_buf_impl_t *dndb; - - ASSERT(MUTEX_HELD(&db->db_mtx)); - ASSERT(zfs_refcount_is_zero(&db->db_holds)); - - if (db->db_buf != NULL) { - arc_buf_destroy(db->db_buf, db); - db->db_buf = NULL; - } - - if (db->db_blkid == DMU_BONUS_BLKID) { - int slots = DB_DNODE(db)->dn_num_slots; - int bonuslen = DN_SLOTS_TO_BONUSLEN(slots); - if (db->db.db_data != 
NULL) { - zio_buf_free(db->db.db_data, bonuslen); - arc_space_return(bonuslen, ARC_SPACE_BONUS); - db->db_state = DB_UNCACHED; - } - } - - dbuf_clear_data(db); - - if (multilist_link_active(&db->db_cache_link)) { - ASSERT(db->db_caching_status == DB_DBUF_CACHE || - db->db_caching_status == DB_DBUF_METADATA_CACHE); - - multilist_remove(dbuf_caches[db->db_caching_status].cache, db); - (void) zfs_refcount_remove_many( - &dbuf_caches[db->db_caching_status].size, - db->db.db_size, db); - - if (db->db_caching_status == DB_DBUF_METADATA_CACHE) { - DBUF_STAT_BUMPDOWN(metadata_cache_count); - } else { - DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]); - DBUF_STAT_BUMPDOWN(cache_count); - DBUF_STAT_DECR(cache_levels_bytes[db->db_level], - db->db.db_size); - } - db->db_caching_status = DB_NO_CACHE; - } - - ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); - ASSERT(db->db_data_pending == NULL); - - db->db_state = DB_EVICTING; - db->db_blkptr = NULL; - - /* - * Now that db_state is DB_EVICTING, nobody else can find this via - * the hash table. We can now drop db_mtx, which allows us to - * acquire the dn_dbufs_mtx. - */ - mutex_exit(&db->db_mtx); - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - dndb = dn->dn_dbuf; - if (db->db_blkid != DMU_BONUS_BLKID) { - boolean_t needlock = !MUTEX_HELD(&dn->dn_dbufs_mtx); - if (needlock) - mutex_enter(&dn->dn_dbufs_mtx); - avl_remove(&dn->dn_dbufs, db); - membar_producer(); - DB_DNODE_EXIT(db); - if (needlock) - mutex_exit(&dn->dn_dbufs_mtx); - /* - * Decrementing the dbuf count means that the hold corresponding - * to the removed dbuf is no longer discounted in dnode_move(), - * so the dnode cannot be moved until after we release the hold. - * The membar_producer() ensures visibility of the decremented - * value in dnode_move(), since DB_DNODE_EXIT doesn't actually - * release any lock. - */ - mutex_enter(&dn->dn_mtx); - dnode_rele_and_unlock(dn, db, B_TRUE); - db->db_dnode_handle = NULL; - - dbuf_hash_remove(db); - } else { - DB_DNODE_EXIT(db); - } - - ASSERT(zfs_refcount_is_zero(&db->db_holds)); - - db->db_parent = NULL; - - ASSERT(db->db_buf == NULL); - ASSERT(db->db.db_data == NULL); - ASSERT(db->db_hash_next == NULL); - ASSERT(db->db_blkptr == NULL); - ASSERT(db->db_data_pending == NULL); - ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE); - ASSERT(!multilist_link_active(&db->db_cache_link)); - - kmem_cache_free(dbuf_kmem_cache, db); - arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF); - - /* - * If this dbuf is referenced from an indirect dbuf, - * decrement the ref count on the indirect dbuf. - */ - if (parent && parent != dndb) { - mutex_enter(&parent->db_mtx); - dbuf_rele_and_unlock(parent, db, B_TRUE); - } -} - -/* - * Note: While bpp will always be updated if the function returns success, - * parentp will not be updated if the dnode does not have dn_dbuf filled in; - * this happens when the dnode is the meta-dnode, or a userused or groupused - * object. 
- */ -__attribute__((always_inline)) -static inline int -dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, - dmu_buf_impl_t **parentp, blkptr_t **bpp, struct dbuf_hold_impl_data *dh) -{ - *parentp = NULL; - *bpp = NULL; - - ASSERT(blkid != DMU_BONUS_BLKID); - - if (blkid == DMU_SPILL_BLKID) { - mutex_enter(&dn->dn_mtx); - if (dn->dn_have_spill && - (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) - *bpp = DN_SPILL_BLKPTR(dn->dn_phys); - else - *bpp = NULL; - dbuf_add_ref(dn->dn_dbuf, NULL); - *parentp = dn->dn_dbuf; - mutex_exit(&dn->dn_mtx); - return (0); - } - - int nlevels = - (dn->dn_phys->dn_nlevels == 0) ? 1 : dn->dn_phys->dn_nlevels; - int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; - - ASSERT3U(level * epbs, <, 64); - ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); - /* - * This assertion shouldn't trip as long as the max indirect block size - * is less than 1M. The reason for this is that up to that point, - * the number of levels required to address an entire object with blocks - * of size SPA_MINBLOCKSIZE satisfies nlevels * epbs + 1 <= 64. In - * other words, if N * epbs + 1 > 64, then if (N-1) * epbs + 1 > 55 - * (i.e. we can address the entire object), objects will all use at most - * N-1 levels and the assertion won't overflow. However, once epbs is - * 13, 4 * 13 + 1 = 53, but 5 * 13 + 1 = 66. Then, 4 levels will not be - * enough to address an entire object, so objects will have 5 levels, - * but then this assertion will overflow. - * - * All this is to say that if we ever increase DN_MAX_INDBLKSHIFT, we - * need to redo this logic to handle overflows. - */ - ASSERT(level >= nlevels || - ((nlevels - level - 1) * epbs) + - highbit64(dn->dn_phys->dn_nblkptr) <= 64); - if (level >= nlevels || - blkid >= ((uint64_t)dn->dn_phys->dn_nblkptr << - ((nlevels - level - 1) * epbs)) || - (fail_sparse && - blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) { - /* the buffer has no parent yet */ - return (SET_ERROR(ENOENT)); - } else if (level < nlevels-1) { - /* this block is referenced from an indirect block */ - int err; - if (dh == NULL) { - err = dbuf_hold_impl(dn, level+1, - blkid >> epbs, fail_sparse, FALSE, NULL, parentp); - } else { - __dbuf_hold_impl_init(dh + 1, dn, dh->dh_level + 1, - blkid >> epbs, fail_sparse, FALSE, NULL, - parentp, dh->dh_depth + 1); - err = __dbuf_hold_impl(dh + 1); - } - if (err) - return (err); - err = dbuf_read(*parentp, NULL, - (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL)); - if (err) { - dbuf_rele(*parentp, NULL); - *parentp = NULL; - return (err); - } - *bpp = ((blkptr_t *)(*parentp)->db.db_data) + - (blkid & ((1ULL << epbs) - 1)); - if (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs))) - ASSERT(BP_IS_HOLE(*bpp)); - return (0); - } else { - /* the block is referenced from the dnode */ - ASSERT3U(level, ==, nlevels-1); - ASSERT(dn->dn_phys->dn_nblkptr == 0 || - blkid < dn->dn_phys->dn_nblkptr); - if (dn->dn_dbuf) { - dbuf_add_ref(dn->dn_dbuf, NULL); - *parentp = dn->dn_dbuf; - } - *bpp = &dn->dn_phys->dn_blkptr[blkid]; - return (0); - } -} - -static dmu_buf_impl_t * -dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, - dmu_buf_impl_t *parent, blkptr_t *blkptr) -{ - objset_t *os = dn->dn_objset; - dmu_buf_impl_t *db, *odb; - - ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); - ASSERT(dn->dn_type != DMU_OT_NONE); - - db = kmem_cache_alloc(dbuf_kmem_cache, KM_SLEEP); - - db->db_objset = os; - db->db.db_object = dn->dn_object; - db->db_level = level; - db->db_blkid = blkid; - db->db_last_dirty = NULL; - 
db->db_dirtycnt = 0; - db->db_dnode_handle = dn->dn_handle; - db->db_parent = parent; - db->db_blkptr = blkptr; - - db->db_user = NULL; - db->db_user_immediate_evict = FALSE; - db->db_freed_in_flight = FALSE; - db->db_pending_evict = FALSE; - - if (blkid == DMU_BONUS_BLKID) { - ASSERT3P(parent, ==, dn->dn_dbuf); - db->db.db_size = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) - - (dn->dn_nblkptr-1) * sizeof (blkptr_t); - ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); - db->db.db_offset = DMU_BONUS_BLKID; - db->db_state = DB_UNCACHED; - db->db_caching_status = DB_NO_CACHE; - /* the bonus dbuf is not placed in the hash table */ - arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF); - return (db); - } else if (blkid == DMU_SPILL_BLKID) { - db->db.db_size = (blkptr != NULL) ? - BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE; - db->db.db_offset = 0; - } else { - int blocksize = - db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz; - db->db.db_size = blocksize; - db->db.db_offset = db->db_blkid * blocksize; - } - - /* - * Hold the dn_dbufs_mtx while we get the new dbuf - * in the hash table *and* added to the dbufs list. - * This prevents a possible deadlock with someone - * trying to look up this dbuf before its added to the - * dn_dbufs list. - */ - mutex_enter(&dn->dn_dbufs_mtx); - db->db_state = DB_EVICTING; - if ((odb = dbuf_hash_insert(db)) != NULL) { - /* someone else inserted it first */ - kmem_cache_free(dbuf_kmem_cache, db); - mutex_exit(&dn->dn_dbufs_mtx); - DBUF_STAT_BUMP(hash_insert_race); - return (odb); - } - avl_add(&dn->dn_dbufs, db); - - db->db_state = DB_UNCACHED; - db->db_caching_status = DB_NO_CACHE; - mutex_exit(&dn->dn_dbufs_mtx); - arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF); - - if (parent && parent != dn->dn_dbuf) - dbuf_add_ref(parent, db); - - ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || - zfs_refcount_count(&dn->dn_holds) > 0); - (void) zfs_refcount_add(&dn->dn_holds, db); - - dprintf_dbuf(db, "db=%p\n", db); - - return (db); -} - -typedef struct dbuf_prefetch_arg { - spa_t *dpa_spa; /* The spa to issue the prefetch in. */ - zbookmark_phys_t dpa_zb; /* The target block to prefetch. */ - int dpa_epbs; /* Entries (blkptr_t's) Per Block Shift. */ - int dpa_curlevel; /* The current level that we're reading */ - dnode_t *dpa_dnode; /* The dnode associated with the prefetch */ - zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */ - zio_t *dpa_zio; /* The parent zio_t for all prefetches. */ - arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */ -} dbuf_prefetch_arg_t; - -/* - * Actually issue the prefetch read for the block given. - */ -static void -dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp) -{ - if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) - return; - - arc_flags_t aflags = - dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; - - ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp)); - ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level); - ASSERT(dpa->dpa_zio != NULL); - (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, NULL, NULL, - dpa->dpa_prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, - &aflags, &dpa->dpa_zb); -} - -/* - * Called when an indirect block above our prefetch target is read in. This - * will either read in the next indirect block down the tree or issue the actual - * prefetch if the next block down is our target. 
- */ -static void -dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb, - const blkptr_t *iobp, arc_buf_t *abuf, void *private) -{ - dbuf_prefetch_arg_t *dpa = private; - - ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel); - ASSERT3S(dpa->dpa_curlevel, >, 0); - - if (abuf == NULL) { - ASSERT(zio == NULL || zio->io_error != 0); - kmem_free(dpa, sizeof (*dpa)); - return; - } - ASSERT(zio == NULL || zio->io_error == 0); - - /* - * The dpa_dnode is only valid if we are called with a NULL - * zio. This indicates that the arc_read() returned without - * first calling zio_read() to issue a physical read. Once - * a physical read is made the dpa_dnode must be invalidated - * as the locks guarding it may have been dropped. If the - * dpa_dnode is still valid, then we want to add it to the dbuf - * cache. To do so, we must hold the dbuf associated with the block - * we just prefetched, read its contents so that we associate it - * with an arc_buf_t, and then release it. - */ - if (zio != NULL) { - ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel); - if (zio->io_flags & ZIO_FLAG_RAW) { - ASSERT3U(BP_GET_PSIZE(zio->io_bp), ==, zio->io_size); - } else { - ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size); - } - ASSERT3P(zio->io_spa, ==, dpa->dpa_spa); - - dpa->dpa_dnode = NULL; - } else if (dpa->dpa_dnode != NULL) { - uint64_t curblkid = dpa->dpa_zb.zb_blkid >> - (dpa->dpa_epbs * (dpa->dpa_curlevel - - dpa->dpa_zb.zb_level)); - dmu_buf_impl_t *db = dbuf_hold_level(dpa->dpa_dnode, - dpa->dpa_curlevel, curblkid, FTAG); - (void) dbuf_read(db, NULL, - DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_HAVESTRUCT); - dbuf_rele(db, FTAG); - } - - if (abuf == NULL) { - kmem_free(dpa, sizeof(*dpa)); - return; - } - - dpa->dpa_curlevel--; - - uint64_t nextblkid = dpa->dpa_zb.zb_blkid >> - (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level)); - blkptr_t *bp = ((blkptr_t *)abuf->b_data) + - P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs); - if (BP_IS_HOLE(bp)) { - kmem_free(dpa, sizeof (*dpa)); - } else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) { - ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid); - dbuf_issue_final_prefetch(dpa, bp); - kmem_free(dpa, sizeof (*dpa)); - } else { - arc_flags_t iter_aflags = ARC_FLAG_NOWAIT; - zbookmark_phys_t zb; - - /* flag if L2ARC eligible, l2arc_noprefetch then decides */ - if (dpa->dpa_aflags & ARC_FLAG_L2CACHE) - iter_aflags |= ARC_FLAG_L2CACHE; - - ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp)); - - SET_BOOKMARK(&zb, dpa->dpa_zb.zb_objset, - dpa->dpa_zb.zb_object, dpa->dpa_curlevel, nextblkid); - - (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, - bp, dbuf_prefetch_indirect_done, dpa, dpa->dpa_prio, - ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, - &iter_aflags, &zb); - } - - arc_buf_destroy(abuf, private); -} - -/* - * Issue prefetch reads for the given block on the given level. If the indirect - * blocks above that block are not in memory, we will read them in - * asynchronously. As a result, this call never blocks waiting for a read to - * complete. - */ -void -dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, - arc_flags_t aflags) -{ - blkptr_t bp; - int epbs, nlevels, curlevel; - uint64_t curblkid; - - ASSERT(blkid != DMU_BONUS_BLKID); - ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); - - if (blkid > dn->dn_maxblkid) - return; - - if (dnode_block_freed(dn, blkid)) - return; - - /* - * This dnode hasn't been written to disk yet, so there's nothing to - * prefetch. 
- */ - nlevels = dn->dn_phys->dn_nlevels; - if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0) - return; - - epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; - if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level)) - return; - - dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object, - level, blkid); - if (db != NULL) { - mutex_exit(&db->db_mtx); - /* - * This dbuf already exists. It is either CACHED, or - * (we assume) about to be read or filled. - */ - return; - } - - /* - * Find the closest ancestor (indirect block) of the target block - * that is present in the cache. In this indirect block, we will - * find the bp that is at curlevel, curblkid. - */ - curlevel = level; - curblkid = blkid; - while (curlevel < nlevels - 1) { - int parent_level = curlevel + 1; - uint64_t parent_blkid = curblkid >> epbs; - dmu_buf_impl_t *db; - - if (dbuf_hold_impl(dn, parent_level, parent_blkid, - FALSE, TRUE, FTAG, &db) == 0) { - blkptr_t *bpp = db->db_buf->b_data; - bp = bpp[P2PHASE(curblkid, 1 << epbs)]; - dbuf_rele(db, FTAG); - break; - } - - curlevel = parent_level; - curblkid = parent_blkid; - } - - if (curlevel == nlevels - 1) { - /* No cached indirect blocks found. */ - ASSERT3U(curblkid, <, dn->dn_phys->dn_nblkptr); - bp = dn->dn_phys->dn_blkptr[curblkid]; - } - if (BP_IS_HOLE(&bp)) - return; - - ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp)); - - zio_t *pio = zio_root(dmu_objset_spa(dn->dn_objset), NULL, NULL, - ZIO_FLAG_CANFAIL); - - dbuf_prefetch_arg_t *dpa = kmem_zalloc(sizeof (*dpa), KM_SLEEP); - dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; - SET_BOOKMARK(&dpa->dpa_zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET, - dn->dn_object, level, blkid); - dpa->dpa_curlevel = curlevel; - dpa->dpa_prio = prio; - dpa->dpa_aflags = aflags; - dpa->dpa_spa = dn->dn_objset->os_spa; - dpa->dpa_dnode = dn; - dpa->dpa_epbs = epbs; - dpa->dpa_zio = pio; - - /* flag if L2ARC eligible, l2arc_noprefetch then decides */ - if (DNODE_LEVEL_IS_L2CACHEABLE(dn, level)) - dpa->dpa_aflags |= ARC_FLAG_L2CACHE; - - /* - * If we have the indirect just above us, no need to do the asynchronous - * prefetch chain; we'll just run the last step ourselves. If we're at - * a higher level, though, we want to issue the prefetches for all the - * indirect blocks asynchronously, so we can go on with whatever we were - * doing. - */ - if (curlevel == level) { - ASSERT3U(curblkid, ==, blkid); - dbuf_issue_final_prefetch(dpa, &bp); - kmem_free(dpa, sizeof (*dpa)); - } else { - arc_flags_t iter_aflags = ARC_FLAG_NOWAIT; - zbookmark_phys_t zb; - - /* flag if L2ARC eligible, l2arc_noprefetch then decides */ - if (DNODE_LEVEL_IS_L2CACHEABLE(dn, level)) - iter_aflags |= ARC_FLAG_L2CACHE; - - SET_BOOKMARK(&zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET, - dn->dn_object, curlevel, curblkid); - (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, - &bp, dbuf_prefetch_indirect_done, dpa, prio, - ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, - &iter_aflags, &zb); - } - /* - * We use pio here instead of dpa_zio since it's possible that - * dpa may have already been freed. - */ - zio_nowait(pio); -} - -#define DBUF_HOLD_IMPL_MAX_DEPTH 20 - -/* - * Helper function for __dbuf_hold_impl() to copy a buffer. Handles - * the case of encrypted, compressed and uncompressed buffers by - * allocating the new buffer, respectively, with arc_alloc_raw_buf(), - * arc_alloc_compressed_buf() or arc_alloc_buf().* - * - * NOTE: Declared noinline to avoid stack bloat in __dbuf_hold_impl(). 
- */ -noinline static void -dbuf_hold_copy(struct dbuf_hold_impl_data *dh) -{ - dnode_t *dn = dh->dh_dn; - dmu_buf_impl_t *db = dh->dh_db; - dbuf_dirty_record_t *dr = dh->dh_dr; - arc_buf_t *data = dr->dt.dl.dr_data; - - enum zio_compress compress_type = arc_get_compression(data); - - if (compress_type != ZIO_COMPRESS_OFF) { - dbuf_set_data(db, arc_alloc_compressed_buf( - dn->dn_objset->os_spa, db, arc_buf_size(data), - arc_buf_lsize(data), compress_type)); - } else { - dbuf_set_data(db, arc_alloc_buf(dn->dn_objset->os_spa, db, - DBUF_GET_BUFC_TYPE(db), db->db.db_size)); - } - - bcopy(data->b_data, db->db.db_data, arc_buf_size(data)); -} - -/* - * Returns with db_holds incremented, and db_mtx not held. - * Note: dn_struct_rwlock must be held. - */ -static int -__dbuf_hold_impl(struct dbuf_hold_impl_data *dh) -{ - ASSERT3S(dh->dh_depth, <, DBUF_HOLD_IMPL_MAX_DEPTH); - dh->dh_parent = NULL; - - ASSERT(dh->dh_blkid != DMU_BONUS_BLKID); - ASSERT(RW_LOCK_HELD(&dh->dh_dn->dn_struct_rwlock)); - ASSERT3U(dh->dh_dn->dn_nlevels, >, dh->dh_level); - - *(dh->dh_dbp) = NULL; - - /* dbuf_find() returns with db_mtx held */ - dh->dh_db = dbuf_find(dh->dh_dn->dn_objset, dh->dh_dn->dn_object, - dh->dh_level, dh->dh_blkid); - - if (dh->dh_db == NULL) { - dh->dh_bp = NULL; - - if (dh->dh_fail_uncached) - return (SET_ERROR(ENOENT)); - - ASSERT3P(dh->dh_parent, ==, NULL); - dh->dh_err = dbuf_findbp(dh->dh_dn, dh->dh_level, dh->dh_blkid, - dh->dh_fail_sparse, &dh->dh_parent, &dh->dh_bp, dh); - if (dh->dh_fail_sparse) { - if (dh->dh_err == 0 && - dh->dh_bp && BP_IS_HOLE(dh->dh_bp)) - dh->dh_err = SET_ERROR(ENOENT); - if (dh->dh_err) { - if (dh->dh_parent) - dbuf_rele(dh->dh_parent, NULL); - return (dh->dh_err); - } - } - if (dh->dh_err && dh->dh_err != ENOENT) - return (dh->dh_err); - dh->dh_db = dbuf_create(dh->dh_dn, dh->dh_level, dh->dh_blkid, - dh->dh_parent, dh->dh_bp); - } - - if (dh->dh_fail_uncached && dh->dh_db->db_state != DB_CACHED) { - mutex_exit(&dh->dh_db->db_mtx); - return (SET_ERROR(ENOENT)); - } - - if (dh->dh_db->db_buf != NULL) { - arc_buf_access(dh->dh_db->db_buf); - ASSERT3P(dh->dh_db->db.db_data, ==, dh->dh_db->db_buf->b_data); - } - - ASSERT(dh->dh_db->db_buf == NULL || arc_referenced(dh->dh_db->db_buf)); - - /* - * If this buffer is currently syncing out, and we are are - * still referencing it from db_data, we need to make a copy - * of it in case we decide we want to dirty it again in this txg. 
- */ - if (dh->dh_db->db_level == 0 && - dh->dh_db->db_blkid != DMU_BONUS_BLKID && - dh->dh_dn->dn_object != DMU_META_DNODE_OBJECT && - dh->dh_db->db_state == DB_CACHED && dh->dh_db->db_data_pending) { - dh->dh_dr = dh->dh_db->db_data_pending; - if (dh->dh_dr->dt.dl.dr_data == dh->dh_db->db_buf) - dbuf_hold_copy(dh); - } - - if (multilist_link_active(&dh->dh_db->db_cache_link)) { - ASSERT(zfs_refcount_is_zero(&dh->dh_db->db_holds)); - ASSERT(dh->dh_db->db_caching_status == DB_DBUF_CACHE || - dh->dh_db->db_caching_status == DB_DBUF_METADATA_CACHE); - - multilist_remove( - dbuf_caches[dh->dh_db->db_caching_status].cache, - dh->dh_db); - (void) zfs_refcount_remove_many( - &dbuf_caches[dh->dh_db->db_caching_status].size, - dh->dh_db->db.db_size, dh->dh_db); - - if (dh->dh_db->db_caching_status == DB_DBUF_METADATA_CACHE) { - DBUF_STAT_BUMPDOWN(metadata_cache_count); - } else { - DBUF_STAT_BUMPDOWN(cache_levels[dh->dh_db->db_level]); - DBUF_STAT_BUMPDOWN(cache_count); - DBUF_STAT_DECR(cache_levels_bytes[dh->dh_db->db_level], - dh->dh_db->db.db_size); - } - dh->dh_db->db_caching_status = DB_NO_CACHE; - } - (void) zfs_refcount_add(&dh->dh_db->db_holds, dh->dh_tag); - DBUF_VERIFY(dh->dh_db); - mutex_exit(&dh->dh_db->db_mtx); - - /* NOTE: we can't rele the parent until after we drop the db_mtx */ - if (dh->dh_parent) - dbuf_rele(dh->dh_parent, NULL); - - ASSERT3P(DB_DNODE(dh->dh_db), ==, dh->dh_dn); - ASSERT3U(dh->dh_db->db_blkid, ==, dh->dh_blkid); - ASSERT3U(dh->dh_db->db_level, ==, dh->dh_level); - *(dh->dh_dbp) = dh->dh_db; - - return (0); -} - -/* - * The following code preserves the recursive function dbuf_hold_impl() - * but moves the local variables AND function arguments to the heap to - * minimize the stack frame size. Enough space is initially allocated - * on the stack for 20 levels of recursion. - */ -int -dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, - boolean_t fail_sparse, boolean_t fail_uncached, - void *tag, dmu_buf_impl_t **dbp) -{ - struct dbuf_hold_impl_data *dh; - int error; - - dh = kmem_alloc(sizeof (struct dbuf_hold_impl_data) * - DBUF_HOLD_IMPL_MAX_DEPTH, KM_SLEEP); - __dbuf_hold_impl_init(dh, dn, level, blkid, fail_sparse, - fail_uncached, tag, dbp, 0); - - error = __dbuf_hold_impl(dh); - - kmem_free(dh, sizeof (struct dbuf_hold_impl_data) * - DBUF_HOLD_IMPL_MAX_DEPTH); - - return (error); -} - -static void -__dbuf_hold_impl_init(struct dbuf_hold_impl_data *dh, - dnode_t *dn, uint8_t level, uint64_t blkid, - boolean_t fail_sparse, boolean_t fail_uncached, - void *tag, dmu_buf_impl_t **dbp, int depth) -{ - dh->dh_dn = dn; - dh->dh_level = level; - dh->dh_blkid = blkid; - - dh->dh_fail_sparse = fail_sparse; - dh->dh_fail_uncached = fail_uncached; - - dh->dh_tag = tag; - dh->dh_dbp = dbp; - - dh->dh_db = NULL; - dh->dh_parent = NULL; - dh->dh_bp = NULL; - dh->dh_err = 0; - dh->dh_dr = NULL; - - dh->dh_depth = depth; -} - -dmu_buf_impl_t * -dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag) -{ - return (dbuf_hold_level(dn, 0, blkid, tag)); -} - -dmu_buf_impl_t * -dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag) -{ - dmu_buf_impl_t *db; - int err = dbuf_hold_impl(dn, level, blkid, FALSE, FALSE, tag, &db); - return (err ? 
NULL : db); -} - -void -dbuf_create_bonus(dnode_t *dn) -{ - ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); - - ASSERT(dn->dn_bonus == NULL); - dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL); -} - -int -dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - dnode_t *dn; - - if (db->db_blkid != DMU_SPILL_BLKID) - return (SET_ERROR(ENOTSUP)); - if (blksz == 0) - blksz = SPA_MINBLOCKSIZE; - ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset))); - blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE); - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - dbuf_new_size(db, blksz, tx); - rw_exit(&dn->dn_struct_rwlock); - DB_DNODE_EXIT(db); - - return (0); -} - -void -dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx) -{ - dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx); -} - -#pragma weak dmu_buf_add_ref = dbuf_add_ref -void -dbuf_add_ref(dmu_buf_impl_t *db, void *tag) -{ - int64_t holds = zfs_refcount_add(&db->db_holds, tag); - ASSERT3S(holds, >, 1); -} - -#pragma weak dmu_buf_try_add_ref = dbuf_try_add_ref -boolean_t -dbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid, - void *tag) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - dmu_buf_impl_t *found_db; - boolean_t result = B_FALSE; - - if (db->db_blkid == DMU_BONUS_BLKID) - found_db = dbuf_find_bonus(os, obj); - else - found_db = dbuf_find(os, obj, 0, blkid); - - if (found_db != NULL) { - if (db == found_db && dbuf_refcount(db) > db->db_dirtycnt) { - (void) zfs_refcount_add(&db->db_holds, tag); - result = B_TRUE; - } - mutex_exit(&db->db_mtx); - } - return (result); -} - -/* - * If you call dbuf_rele() you had better not be referencing the dnode handle - * unless you have some other direct or indirect hold on the dnode. (An indirect - * hold is a hold on one of the dnode's dbufs, including the bonus buffer.) - * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the - * dnode's parent dbuf evicting its dnode handles. - */ -void -dbuf_rele(dmu_buf_impl_t *db, void *tag) -{ - mutex_enter(&db->db_mtx); - dbuf_rele_and_unlock(db, tag, B_FALSE); -} - -void -dmu_buf_rele(dmu_buf_t *db, void *tag) -{ - dbuf_rele((dmu_buf_impl_t *)db, tag); -} - -/* - * dbuf_rele() for an already-locked dbuf. This is necessary to allow - * db_dirtycnt and db_holds to be updated atomically. The 'evicting' - * argument should be set if we are already in the dbuf-evicting code - * path, in which case we don't want to recursively evict. This allows us to - * avoid deeply nested stacks that would have a call flow similar to this: - * - * dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify() - * ^ | - * | | - * +-----dbuf_destroy()<--dbuf_evict_one()<--------+ - * - */ -void -dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag, boolean_t evicting) -{ - int64_t holds; - uint64_t size; - - ASSERT(MUTEX_HELD(&db->db_mtx)); - DBUF_VERIFY(db); - - /* - * Remove the reference to the dbuf before removing its hold on the - * dnode so we can guarantee in dnode_move() that a referenced bonus - * buffer has a corresponding dnode hold. - */ - holds = zfs_refcount_remove(&db->db_holds, tag); - ASSERT(holds >= 0); - - /* - * We can't freeze indirects if there is a possibility that they - * may be modified in the current syncing context. - */ - if (db->db_buf != NULL && - holds == (db->db_level == 0 ? 
db->db_dirtycnt : 0)) { - arc_buf_freeze(db->db_buf); - } - - if (holds == db->db_dirtycnt && - db->db_level == 0 && db->db_user_immediate_evict) - dbuf_evict_user(db); - - if (holds == 0) { - if (db->db_blkid == DMU_BONUS_BLKID) { - dnode_t *dn; - boolean_t evict_dbuf = db->db_pending_evict; - - /* - * If the dnode moves here, we cannot cross this - * barrier until the move completes. - */ - DB_DNODE_ENTER(db); - - dn = DB_DNODE(db); - atomic_dec_32(&dn->dn_dbufs_count); - - /* - * Decrementing the dbuf count means that the bonus - * buffer's dnode hold is no longer discounted in - * dnode_move(). The dnode cannot move until after - * the dnode_rele() below. - */ - DB_DNODE_EXIT(db); - - /* - * Do not reference db after its lock is dropped. - * Another thread may evict it. - */ - mutex_exit(&db->db_mtx); - - if (evict_dbuf) - dnode_evict_bonus(dn); - - dnode_rele(dn, db); - } else if (db->db_buf == NULL) { - /* - * This is a special case: we never associated this - * dbuf with any data allocated from the ARC. - */ - ASSERT(db->db_state == DB_UNCACHED || - db->db_state == DB_NOFILL); - dbuf_destroy(db); - } else if (arc_released(db->db_buf)) { - /* - * This dbuf has anonymous data associated with it. - */ - dbuf_destroy(db); - } else { - boolean_t do_arc_evict = B_FALSE; - blkptr_t bp; - spa_t *spa = dmu_objset_spa(db->db_objset); - - if (!DBUF_IS_CACHEABLE(db) && - db->db_blkptr != NULL && - !BP_IS_HOLE(db->db_blkptr) && - !BP_IS_EMBEDDED(db->db_blkptr)) { - do_arc_evict = B_TRUE; - bp = *db->db_blkptr; - } - - if (!DBUF_IS_CACHEABLE(db) || - db->db_pending_evict) { - dbuf_destroy(db); - } else if (!multilist_link_active(&db->db_cache_link)) { - ASSERT3U(db->db_caching_status, ==, - DB_NO_CACHE); - - dbuf_cached_state_t dcs = - dbuf_include_in_metadata_cache(db) ? 
- DB_DBUF_METADATA_CACHE : DB_DBUF_CACHE; - db->db_caching_status = dcs; - - multilist_insert(dbuf_caches[dcs].cache, db); - size = zfs_refcount_add_many( - &dbuf_caches[dcs].size, db->db.db_size, db); - - if (dcs == DB_DBUF_METADATA_CACHE) { - DBUF_STAT_BUMP(metadata_cache_count); - DBUF_STAT_MAX( - metadata_cache_size_bytes_max, - size); - } else { - DBUF_STAT_BUMP( - cache_levels[db->db_level]); - DBUF_STAT_BUMP(cache_count); - DBUF_STAT_INCR( - cache_levels_bytes[db->db_level], - db->db.db_size); - DBUF_STAT_MAX(cache_size_bytes_max, - size); - } - mutex_exit(&db->db_mtx); - - if (dcs == DB_DBUF_CACHE && !evicting) - dbuf_evict_notify(size); - } - - if (do_arc_evict) - arc_freed(spa, &bp); - } - } else { - mutex_exit(&db->db_mtx); - } - -} - -#pragma weak dmu_buf_refcount = dbuf_refcount -uint64_t -dbuf_refcount(dmu_buf_impl_t *db) -{ - return (zfs_refcount_count(&db->db_holds)); -} - -void * -dmu_buf_replace_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user, - dmu_buf_user_t *new_user) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - - mutex_enter(&db->db_mtx); - dbuf_verify_user(db, DBVU_NOT_EVICTING); - if (db->db_user == old_user) - db->db_user = new_user; - else - old_user = db->db_user; - dbuf_verify_user(db, DBVU_NOT_EVICTING); - mutex_exit(&db->db_mtx); - - return (old_user); -} - -void * -dmu_buf_set_user(dmu_buf_t *db_fake, dmu_buf_user_t *user) -{ - return (dmu_buf_replace_user(db_fake, NULL, user)); -} - -void * -dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - - db->db_user_immediate_evict = TRUE; - return (dmu_buf_set_user(db_fake, user)); -} - -void * -dmu_buf_remove_user(dmu_buf_t *db_fake, dmu_buf_user_t *user) -{ - return (dmu_buf_replace_user(db_fake, user, NULL)); -} - -void * -dmu_buf_get_user(dmu_buf_t *db_fake) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - - dbuf_verify_user(db, DBVU_NOT_EVICTING); - return (db->db_user); -} - -void -dmu_buf_user_evict_wait() -{ - taskq_wait(dbu_evict_taskq); -} - -blkptr_t * -dmu_buf_get_blkptr(dmu_buf_t *db) -{ - dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; - return (dbi->db_blkptr); -} - -objset_t * -dmu_buf_get_objset(dmu_buf_t *db) -{ - dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; - return (dbi->db_objset); -} - -dnode_t * -dmu_buf_dnode_enter(dmu_buf_t *db) -{ - dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; - DB_DNODE_ENTER(dbi); - return (DB_DNODE(dbi)); -} - -void -dmu_buf_dnode_exit(dmu_buf_t *db) -{ - dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; - DB_DNODE_EXIT(dbi); -} - -static void -dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) -{ - /* ASSERT(dmu_tx_is_syncing(tx) */ - ASSERT(MUTEX_HELD(&db->db_mtx)); - - if (db->db_blkptr != NULL) - return; - - if (db->db_blkid == DMU_SPILL_BLKID) { - db->db_blkptr = DN_SPILL_BLKPTR(dn->dn_phys); - BP_ZERO(db->db_blkptr); - return; - } - if (db->db_level == dn->dn_phys->dn_nlevels-1) { - /* - * This buffer was allocated at a time when there was - * no available blkptrs from the dnode, or it was - * inappropriate to hook it in (i.e., nlevels mis-match). 
- */ - ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr); - ASSERT(db->db_parent == NULL); - db->db_parent = dn->dn_dbuf; - db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid]; - DBUF_VERIFY(db); - } else { - dmu_buf_impl_t *parent = db->db_parent; - int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; - - ASSERT(dn->dn_phys->dn_nlevels > 1); - if (parent == NULL) { - mutex_exit(&db->db_mtx); - rw_enter(&dn->dn_struct_rwlock, RW_READER); - parent = dbuf_hold_level(dn, db->db_level + 1, - db->db_blkid >> epbs, db); - rw_exit(&dn->dn_struct_rwlock); - mutex_enter(&db->db_mtx); - db->db_parent = parent; - } - db->db_blkptr = (blkptr_t *)parent->db.db_data + - (db->db_blkid & ((1ULL << epbs) - 1)); - DBUF_VERIFY(db); - } -} - -/* - * dbuf_sync_indirect() is called recursively from dbuf_sync_list() so it - * is critical the we not allow the compiler to inline this function in to - * dbuf_sync_list() thereby drastically bloating the stack usage. - */ -noinline static void -dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) -{ - dmu_buf_impl_t *db = dr->dr_dbuf; - dnode_t *dn; - zio_t *zio; - - ASSERT(dmu_tx_is_syncing(tx)); - - dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); - - mutex_enter(&db->db_mtx); - - ASSERT(db->db_level > 0); - DBUF_VERIFY(db); - - /* Read the block if it hasn't been read yet. */ - if (db->db_buf == NULL) { - mutex_exit(&db->db_mtx); - (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); - mutex_enter(&db->db_mtx); - } - ASSERT3U(db->db_state, ==, DB_CACHED); - ASSERT(db->db_buf != NULL); - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - /* Indirect block size must match what the dnode thinks it is. */ - ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); - dbuf_check_blkptr(dn, db); - DB_DNODE_EXIT(db); - - /* Provide the pending dirty record to child dbufs */ - db->db_data_pending = dr; - - mutex_exit(&db->db_mtx); - - dbuf_write(dr, db->db_buf, tx); - - zio = dr->dr_zio; - mutex_enter(&dr->dt.di.dr_mtx); - dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx); - ASSERT(list_head(&dr->dt.di.dr_children) == NULL); - mutex_exit(&dr->dt.di.dr_mtx); - zio_nowait(zio); -} - -/* - * dbuf_sync_leaf() is called recursively from dbuf_sync_list() so it is - * critical the we not allow the compiler to inline this function in to - * dbuf_sync_list() thereby drastically bloating the stack usage. - */ -noinline static void -dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) -{ - arc_buf_t **datap = &dr->dt.dl.dr_data; - dmu_buf_impl_t *db = dr->dr_dbuf; - dnode_t *dn; - objset_t *os; - uint64_t txg = tx->tx_txg; - - ASSERT(dmu_tx_is_syncing(tx)); - - dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); - - mutex_enter(&db->db_mtx); - /* - * To be synced, we must be dirtied. But we - * might have been freed after the dirty. - */ - if (db->db_state == DB_UNCACHED) { - /* This buffer has been freed since it was dirtied */ - ASSERT(db->db.db_data == NULL); - } else if (db->db_state == DB_FILL) { - /* This buffer was freed and is now being re-filled */ - ASSERT(db->db.db_data != dr->dt.dl.dr_data); - } else { - ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL); - } - DBUF_VERIFY(db); - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - - if (db->db_blkid == DMU_SPILL_BLKID) { - mutex_enter(&dn->dn_mtx); - if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) { - /* - * In the previous transaction group, the bonus buffer - * was entirely used to store the attributes for the - * dnode which overrode the dn_spill field. 
However, - * when adding more attributes to the file a spill - * block was required to hold the extra attributes. - * - * Make sure to clear the garbage left in the dn_spill - * field from the previous attributes in the bonus - * buffer. Otherwise, after writing out the spill - * block to the new allocated dva, it will free - * the old block pointed to by the invalid dn_spill. - */ - db->db_blkptr = NULL; - } - dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR; - mutex_exit(&dn->dn_mtx); - } - - /* - * If this is a bonus buffer, simply copy the bonus data into the - * dnode. It will be written out when the dnode is synced (and it - * will be synced, since it must have been dirty for dbuf_sync to - * be called). - */ - if (db->db_blkid == DMU_BONUS_BLKID) { - dbuf_dirty_record_t **drp; - - ASSERT(*datap != NULL); - ASSERT0(db->db_level); - ASSERT3U(DN_MAX_BONUS_LEN(dn->dn_phys), <=, - DN_SLOTS_TO_BONUSLEN(dn->dn_phys->dn_extra_slots + 1)); - bcopy(*datap, DN_BONUS(dn->dn_phys), - DN_MAX_BONUS_LEN(dn->dn_phys)); - DB_DNODE_EXIT(db); - - if (*datap != db->db.db_data) { - int slots = DB_DNODE(db)->dn_num_slots; - int bonuslen = DN_SLOTS_TO_BONUSLEN(slots); - zio_buf_free(*datap, bonuslen); - arc_space_return(bonuslen, ARC_SPACE_BONUS); - } - db->db_data_pending = NULL; - drp = &db->db_last_dirty; - while (*drp != dr) - drp = &(*drp)->dr_next; - ASSERT(dr->dr_next == NULL); - ASSERT(dr->dr_dbuf == db); - *drp = dr->dr_next; - if (dr->dr_dbuf->db_level != 0) { - mutex_destroy(&dr->dt.di.dr_mtx); - list_destroy(&dr->dt.di.dr_children); - } - kmem_free(dr, sizeof (dbuf_dirty_record_t)); - ASSERT(db->db_dirtycnt > 0); - db->db_dirtycnt -= 1; - dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg, B_FALSE); - return; - } - - os = dn->dn_objset; - - /* - * This function may have dropped the db_mtx lock allowing a dmu_sync - * operation to sneak in. As a result, we need to ensure that we - * don't check the dr_override_state until we have returned from - * dbuf_check_blkptr. - */ - dbuf_check_blkptr(dn, db); - - /* - * If this buffer is in the middle of an immediate write, - * wait for the synchronous IO to complete. - */ - while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) { - ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); - cv_wait(&db->db_changed, &db->db_mtx); - ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN); - } - - if (db->db_state != DB_NOFILL && - dn->dn_object != DMU_META_DNODE_OBJECT && - zfs_refcount_count(&db->db_holds) > 1 && - dr->dt.dl.dr_override_state != DR_OVERRIDDEN && - *datap == db->db_buf) { - /* - * If this buffer is currently "in use" (i.e., there - * are active holds and db_data still references it), - * then make a copy before we start the write so that - * any modifications from the open txg will not leak - * into this write. - * - * NOTE: this copy does not need to be made for - * objects only modified in the syncing context (e.g. - * DNONE_DNODE blocks). 
- */ - int psize = arc_buf_size(*datap); - arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - enum zio_compress compress_type = arc_get_compression(*datap); - - if (compress_type == ZIO_COMPRESS_OFF) { - *datap = arc_alloc_buf(os->os_spa, db, type, psize); - } else { - ASSERT3U(type, ==, ARC_BUFC_DATA); - int lsize = arc_buf_lsize(*datap); - *datap = arc_alloc_compressed_buf(os->os_spa, db, - psize, lsize, compress_type); - } - bcopy(db->db.db_data, (*datap)->b_data, psize); - } - db->db_data_pending = dr; - - mutex_exit(&db->db_mtx); - - dbuf_write(dr, *datap, tx); - - ASSERT(!list_link_active(&dr->dr_dirty_node)); - if (dn->dn_object == DMU_META_DNODE_OBJECT) { - list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr); - DB_DNODE_EXIT(db); - } else { - /* - * Although zio_nowait() does not "wait for an IO", it does - * initiate the IO. If this is an empty write it seems plausible - * that the IO could actually be completed before the nowait - * returns. We need to DB_DNODE_EXIT() first in case - * zio_nowait() invalidates the dbuf. - */ - DB_DNODE_EXIT(db); - zio_nowait(dr->dr_zio); - } -} - -void -dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx) -{ - dbuf_dirty_record_t *dr; - - while (dr = list_head(list)) { - if (dr->dr_zio != NULL) { - /* - * If we find an already initialized zio then we - * are processing the meta-dnode, and we have finished. - * The dbufs for all dnodes are put back on the list - * during processing, so that we can zio_wait() - * these IOs after initiating all child IOs. - */ - ASSERT3U(dr->dr_dbuf->db.db_object, ==, - DMU_META_DNODE_OBJECT); - break; - } - if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID && - dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) { - VERIFY3U(dr->dr_dbuf->db_level, ==, level); - } - list_remove(list, dr); - if (dr->dr_dbuf->db_level > 0) - dbuf_sync_indirect(dr, tx); - else - dbuf_sync_leaf(dr, tx); - } -} - -/* ARGSUSED */ -static void -dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) -{ - dmu_buf_impl_t *db = vdb; - dnode_t *dn; - blkptr_t *bp = zio->io_bp; - blkptr_t *bp_orig = &zio->io_bp_orig; - spa_t *spa = zio->io_spa; - int64_t delta; - uint64_t fill = 0; - int i; - - ASSERT3P(db->db_blkptr, !=, NULL); - ASSERT3P(&db->db_data_pending->dr_bp_copy, ==, bp); - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig); - dnode_diduse_space(dn, delta - zio->io_prev_space_delta); - zio->io_prev_space_delta = delta; - - if (bp->blk_birth != 0) { - ASSERT((db->db_blkid != DMU_SPILL_BLKID && - BP_GET_TYPE(bp) == dn->dn_type) || - (db->db_blkid == DMU_SPILL_BLKID && - BP_GET_TYPE(bp) == dn->dn_bonustype) || - BP_IS_EMBEDDED(bp)); - ASSERT(BP_GET_LEVEL(bp) == db->db_level); - } - - mutex_enter(&db->db_mtx); - -#ifdef ZFS_DEBUG - if (db->db_blkid == DMU_SPILL_BLKID) { - ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); - ASSERT(!(BP_IS_HOLE(bp)) && - db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys)); - } -#endif - - if (db->db_level == 0) { - mutex_enter(&dn->dn_mtx); - if (db->db_blkid > dn->dn_phys->dn_maxblkid && - db->db_blkid != DMU_SPILL_BLKID) - dn->dn_phys->dn_maxblkid = db->db_blkid; - mutex_exit(&dn->dn_mtx); - - if (dn->dn_type == DMU_OT_DNODE) { - i = 0; - while (i < db->db.db_size) { - dnode_phys_t *dnp = - (void *)(((char *)db->db.db_data) + i); - - i += DNODE_MIN_SIZE; - if (dnp->dn_type != DMU_OT_NONE) { - fill++; - i += dnp->dn_extra_slots * - DNODE_MIN_SIZE; - } - } - } else { - if (BP_IS_HOLE(bp)) { - fill = 0; - } else { - fill = 1; - } - } - } else { - 
blkptr_t *ibp = db->db.db_data; - ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); - for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) { - if (BP_IS_HOLE(ibp)) - continue; - fill += BP_GET_FILL(ibp); - } - } - DB_DNODE_EXIT(db); - - if (!BP_IS_EMBEDDED(bp)) - bp->blk_fill = fill; - - mutex_exit(&db->db_mtx); - - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - *db->db_blkptr = *bp; - rw_exit(&dn->dn_struct_rwlock); -} - -/* ARGSUSED */ -/* - * This function gets called just prior to running through the compression - * stage of the zio pipeline. If we're an indirect block comprised of only - * holes, then we want this indirect to be compressed away to a hole. In - * order to do that we must zero out any information about the holes that - * this indirect points to prior to before we try to compress it. - */ -static void -dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb) -{ - dmu_buf_impl_t *db = vdb; - dnode_t *dn; - blkptr_t *bp; - unsigned int epbs, i; - - ASSERT3U(db->db_level, >, 0); - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; - ASSERT3U(epbs, <, 31); - - /* Determine if all our children are holes */ - for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++) { - if (!BP_IS_HOLE(bp)) - break; - } - - /* - * If all the children are holes, then zero them all out so that - * we may get compressed away. - */ - if (i == 1 << epbs) { - /* - * We only found holes. Grab the rwlock to prevent - * anybody from reading the blocks we're about to - * zero out. - */ - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - bzero(db->db.db_data, db->db.db_size); - rw_exit(&dn->dn_struct_rwlock); - } - DB_DNODE_EXIT(db); -} - -/* - * The SPA will call this callback several times for each zio - once - * for every physical child i/o (zio->io_phys_children times). This - * allows the DMU to monitor the progress of each logical i/o. For example, - * there may be 2 copies of an indirect block, or many fragments of a RAID-Z - * block. There may be a long delay before all copies/fragments are completed, - * so this callback allows us to retire dirty space gradually, as the physical - * i/os complete. - */ -/* ARGSUSED */ -static void -dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg) -{ - dmu_buf_impl_t *db = arg; - objset_t *os = db->db_objset; - dsl_pool_t *dp = dmu_objset_pool(os); - dbuf_dirty_record_t *dr; - int delta = 0; - - dr = db->db_data_pending; - ASSERT3U(dr->dr_txg, ==, zio->io_txg); - - /* - * The callback will be called io_phys_children times. Retire one - * portion of our dirty space each time we are called. Any rounding - * error will be cleaned up by dsl_pool_sync()'s call to - * dsl_pool_undirty_space(). - */ - delta = dr->dr_accounted / zio->io_phys_children; - dsl_pool_undirty_space(dp, delta, zio->io_txg); -} - -/* ARGSUSED */ -static void -dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) -{ - dmu_buf_impl_t *db = vdb; - blkptr_t *bp_orig = &zio->io_bp_orig; - blkptr_t *bp = db->db_blkptr; - objset_t *os = db->db_objset; - dmu_tx_t *tx = os->os_synctx; - dbuf_dirty_record_t **drp, *dr; - - ASSERT0(zio->io_error); - ASSERT(db->db_blkptr == bp); - - /* - * For nopwrites and rewrites we ensure that the bp matches our - * original and bypass all the accounting. 
- */ - if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) { - ASSERT(BP_EQUAL(bp, bp_orig)); - } else { - dsl_dataset_t *ds = os->os_dsl_dataset; - (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE); - dsl_dataset_block_born(ds, bp, tx); - } - - mutex_enter(&db->db_mtx); - - DBUF_VERIFY(db); - - drp = &db->db_last_dirty; - while ((dr = *drp) != db->db_data_pending) - drp = &dr->dr_next; - ASSERT(!list_link_active(&dr->dr_dirty_node)); - ASSERT(dr->dr_dbuf == db); - ASSERT(dr->dr_next == NULL); - *drp = dr->dr_next; - -#ifdef ZFS_DEBUG - if (db->db_blkid == DMU_SPILL_BLKID) { - dnode_t *dn; - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); - ASSERT(!(BP_IS_HOLE(db->db_blkptr)) && - db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys)); - DB_DNODE_EXIT(db); - } -#endif - - if (db->db_level == 0) { - ASSERT(db->db_blkid != DMU_BONUS_BLKID); - ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); - if (db->db_state != DB_NOFILL) { - if (dr->dt.dl.dr_data != db->db_buf) - arc_buf_destroy(dr->dt.dl.dr_data, db); - } - } else { - dnode_t *dn; - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - ASSERT(list_head(&dr->dt.di.dr_children) == NULL); - ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift); - if (!BP_IS_HOLE(db->db_blkptr)) { - int epbs = - dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; - ASSERT3U(db->db_blkid, <=, - dn->dn_phys->dn_maxblkid >> (db->db_level * epbs)); - ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, - db->db.db_size); - } - DB_DNODE_EXIT(db); - mutex_destroy(&dr->dt.di.dr_mtx); - list_destroy(&dr->dt.di.dr_children); - } - kmem_free(dr, sizeof (dbuf_dirty_record_t)); - - cv_broadcast(&db->db_changed); - ASSERT(db->db_dirtycnt > 0); - db->db_dirtycnt -= 1; - db->db_data_pending = NULL; - dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE); -} - -static void -dbuf_write_nofill_ready(zio_t *zio) -{ - dbuf_write_ready(zio, NULL, zio->io_private); -} - -static void -dbuf_write_nofill_done(zio_t *zio) -{ - dbuf_write_done(zio, NULL, zio->io_private); -} - -static void -dbuf_write_override_ready(zio_t *zio) -{ - dbuf_dirty_record_t *dr = zio->io_private; - dmu_buf_impl_t *db = dr->dr_dbuf; - - dbuf_write_ready(zio, NULL, db); -} - -static void -dbuf_write_override_done(zio_t *zio) -{ - dbuf_dirty_record_t *dr = zio->io_private; - dmu_buf_impl_t *db = dr->dr_dbuf; - blkptr_t *obp = &dr->dt.dl.dr_overridden_by; - - mutex_enter(&db->db_mtx); - if (!BP_EQUAL(zio->io_bp, obp)) { - if (!BP_IS_HOLE(obp)) - dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp); - arc_release(dr->dt.dl.dr_data, db); - } - mutex_exit(&db->db_mtx); - dbuf_write_done(zio, NULL, db); - - if (zio->io_abd != NULL) - abd_put(zio->io_abd); -} - -typedef struct dbuf_remap_impl_callback_arg { - objset_t *drica_os; - uint64_t drica_blk_birth; - dmu_tx_t *drica_tx; -} dbuf_remap_impl_callback_arg_t; - -static void -dbuf_remap_impl_callback(uint64_t vdev, uint64_t offset, uint64_t size, - void *arg) -{ - dbuf_remap_impl_callback_arg_t *drica = arg; - objset_t *os = drica->drica_os; - spa_t *spa = dmu_objset_spa(os); - dmu_tx_t *tx = drica->drica_tx; - - ASSERT(dsl_pool_sync_context(spa_get_dsl(spa))); - - if (os == spa_meta_objset(spa)) { - spa_vdev_indirect_mark_obsolete(spa, vdev, offset, size, tx); - } else { - dsl_dataset_block_remapped(dmu_objset_ds(os), vdev, offset, - size, drica->drica_blk_birth, tx); - } -} - -static void -dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, dmu_tx_t *tx) -{ - blkptr_t bp_copy = *bp; - spa_t *spa = 
dmu_objset_spa(dn->dn_objset); - dbuf_remap_impl_callback_arg_t drica; - - ASSERT(dsl_pool_sync_context(spa_get_dsl(spa))); - - drica.drica_os = dn->dn_objset; - drica.drica_blk_birth = bp->blk_birth; - drica.drica_tx = tx; - if (spa_remap_blkptr(spa, &bp_copy, dbuf_remap_impl_callback, - &drica)) { - /* - * The struct_rwlock prevents dbuf_read_impl() from - * dereferencing the BP while we are changing it. To - * avoid lock contention, only grab it when we are actually - * changing the BP. - */ - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - *bp = bp_copy; - rw_exit(&dn->dn_struct_rwlock); - } -} - -/* - * Returns true if a dbuf_remap would modify the dbuf. We do this by attempting - * to remap a copy of every bp in the dbuf. - */ -boolean_t -dbuf_can_remap(const dmu_buf_impl_t *db) -{ - spa_t *spa = dmu_objset_spa(db->db_objset); - blkptr_t *bp = db->db.db_data; - boolean_t ret = B_FALSE; - - ASSERT3U(db->db_level, >, 0); - ASSERT3S(db->db_state, ==, DB_CACHED); - - ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL)); - - spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); - for (int i = 0; i < db->db.db_size >> SPA_BLKPTRSHIFT; i++) { - blkptr_t bp_copy = bp[i]; - if (spa_remap_blkptr(spa, &bp_copy, NULL, NULL)) { - ret = B_TRUE; - break; - } - } - spa_config_exit(spa, SCL_VDEV, FTAG); - - return (ret); -} - -boolean_t -dnode_needs_remap(const dnode_t *dn) -{ - spa_t *spa = dmu_objset_spa(dn->dn_objset); - boolean_t ret = B_FALSE; - - if (dn->dn_phys->dn_nlevels == 0) { - return (B_FALSE); - } - - ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL)); - - spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); - for (int j = 0; j < dn->dn_phys->dn_nblkptr; j++) { - blkptr_t bp_copy = dn->dn_phys->dn_blkptr[j]; - if (spa_remap_blkptr(spa, &bp_copy, NULL, NULL)) { - ret = B_TRUE; - break; - } - } - spa_config_exit(spa, SCL_VDEV, FTAG); - - return (ret); -} - -/* - * Remap any existing BP's to concrete vdevs, if possible. - */ -static void -dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx) -{ - spa_t *spa = dmu_objset_spa(db->db_objset); - ASSERT(dsl_pool_sync_context(spa_get_dsl(spa))); - - if (!spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL)) - return; - - if (db->db_level > 0) { - blkptr_t *bp = db->db.db_data; - for (int i = 0; i < db->db.db_size >> SPA_BLKPTRSHIFT; i++) { - dbuf_remap_impl(dn, &bp[i], tx); - } - } else if (db->db.db_object == DMU_META_DNODE_OBJECT) { - dnode_phys_t *dnp = db->db.db_data; - ASSERT3U(db->db_dnode_handle->dnh_dnode->dn_type, ==, - DMU_OT_DNODE); - for (int i = 0; i < db->db.db_size >> DNODE_SHIFT; - i += dnp[i].dn_extra_slots + 1) { - for (int j = 0; j < dnp[i].dn_nblkptr; j++) { - dbuf_remap_impl(dn, &dnp[i].dn_blkptr[j], tx); - } - } - } -} - - -/* Issue I/O to commit a dirty buffer to disk. */ -static void -dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) -{ - dmu_buf_impl_t *db = dr->dr_dbuf; - dnode_t *dn; - objset_t *os; - dmu_buf_impl_t *parent = db->db_parent; - uint64_t txg = tx->tx_txg; - zbookmark_phys_t zb; - zio_prop_t zp; - zio_t *zio; - int wp_flag = 0; - - ASSERT(dmu_tx_is_syncing(tx)); - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - os = dn->dn_objset; - - if (db->db_state != DB_NOFILL) { - if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) { - /* - * Private object buffers are released here rather - * than in dbuf_dirty() since they are only modified - * in the syncing context and we don't want the - * overhead of making multiple copies of the data. 
- */ - if (BP_IS_HOLE(db->db_blkptr)) { - arc_buf_thaw(data); - } else { - dbuf_release_bp(db); - } - dbuf_remap(dn, db, tx); - } - } - - if (parent != dn->dn_dbuf) { - /* Our parent is an indirect block. */ - /* We have a dirty parent that has been scheduled for write. */ - ASSERT(parent && parent->db_data_pending); - /* Our parent's buffer is one level closer to the dnode. */ - ASSERT(db->db_level == parent->db_level-1); - /* - * We're about to modify our parent's db_data by modifying - * our block pointer, so the parent must be released. - */ - ASSERT(arc_released(parent->db_buf)); - zio = parent->db_data_pending->dr_zio; - } else { - /* Our parent is the dnode itself. */ - ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 && - db->db_blkid != DMU_SPILL_BLKID) || - (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0)); - if (db->db_blkid != DMU_SPILL_BLKID) - ASSERT3P(db->db_blkptr, ==, - &dn->dn_phys->dn_blkptr[db->db_blkid]); - zio = dn->dn_zio; - } - - ASSERT(db->db_level == 0 || data == db->db_buf); - ASSERT3U(db->db_blkptr->blk_birth, <=, txg); - ASSERT(zio); - - SET_BOOKMARK(&zb, os->os_dsl_dataset ? - os->os_dsl_dataset->ds_object : DMU_META_OBJSET, - db->db.db_object, db->db_level, db->db_blkid); - - if (db->db_blkid == DMU_SPILL_BLKID) - wp_flag = WP_SPILL; - wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0; - - dmu_write_policy(os, dn, db->db_level, wp_flag, &zp); - DB_DNODE_EXIT(db); - - /* - * We copy the blkptr now (rather than when we instantiate the dirty - * record), because its value can change between open context and - * syncing context. We do not need to hold dn_struct_rwlock to read - * db_blkptr because we are in syncing context. - */ - dr->dr_bp_copy = *db->db_blkptr; - - if (db->db_level == 0 && - dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { - /* - * The BP for this block has been provided by open context - * (by dmu_sync() or dmu_buf_write_embedded()). - */ - abd_t *contents = (data != NULL) ? - abd_get_from_buf(data->b_data, arc_buf_size(data)) : NULL; - - dr->dr_zio = zio_write(zio, os->os_spa, txg, &dr->dr_bp_copy, - contents, db->db.db_size, db->db.db_size, &zp, - dbuf_write_override_ready, NULL, NULL, - dbuf_write_override_done, - dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); - mutex_enter(&db->db_mtx); - dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; - zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by, - dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite); - mutex_exit(&db->db_mtx); - } else if (db->db_state == DB_NOFILL) { - ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF || - zp.zp_checksum == ZIO_CHECKSUM_NOPARITY); - dr->dr_zio = zio_write(zio, os->os_spa, txg, - &dr->dr_bp_copy, NULL, db->db.db_size, db->db.db_size, &zp, - dbuf_write_nofill_ready, NULL, NULL, - dbuf_write_nofill_done, db, - ZIO_PRIORITY_ASYNC_WRITE, - ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb); - } else { - ASSERT(arc_released(data)); - - /* - * For indirect blocks, we want to setup the children - * ready callback so that we can properly handle an indirect - * block that only contains holes. 
- */ - arc_write_done_func_t *children_ready_cb = NULL; - if (db->db_level != 0) - children_ready_cb = dbuf_write_children_ready; - - dr->dr_zio = arc_write(zio, os->os_spa, txg, - &dr->dr_bp_copy, data, DBUF_IS_L2CACHEABLE(db), - &zp, dbuf_write_ready, children_ready_cb, - dbuf_write_physdone, dbuf_write_done, db, - ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); - } -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf_stats.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf_stats.c deleted file mode 100644 index 0a86830f71ad..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf_stats.c +++ /dev/null @@ -1,242 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -#include <sys/zfs_context.h> -#include <sys/dbuf.h> -#include <sys/dmu_objset.h> - -/* - * Calculate the index of the arc header for the state, disabled by default. - */ -int zfs_dbuf_state_index = 0; - -/* - * ========================================================================== - * Dbuf Hash Read Routines - * ========================================================================== - */ -typedef struct dbuf_stats_t { - kmutex_t lock; - kstat_t *kstat; - dbuf_hash_table_t *hash; - int idx; -} dbuf_stats_t; - -static dbuf_stats_t dbuf_stats_hash_table; - -static int -dbuf_stats_hash_table_headers(char *buf, size_t size) -{ - size = snprintf(buf, size - 1, - "%-88s | %-124s | %s\n" - "%-16s %-8s %-8s %-8s %-8s %-8s %-8s %-5s %-5s %5s | " - "%-5s %-5s %-6s %-8s %-6s %-8s %-12s " - "%-6s %-6s %-6s %-6s %-6s %-8s %-8s %-8s %-5s | " - "%-6s %-6s %-8s %-8s %-6s %-6s %-5s %-8s %-8s\n", - "dbuf", "arcbuf", "dnode", "pool", "objset", "object", "level", - "blkid", "offset", "dbsize", "meta", "state", "dbholds", "list", - "atype", "index", "flags", "count", "asize", "access", "mru", "gmru", - "mfu", "gmfu", "l2", "l2_dattr", "l2_asize", "l2_comp", "aholds", - "dtype", "btype", "data_bs", "meta_bs", "bsize", - "lvls", "dholds", "blocks", "dsize"); - buf[size] = '\0'; - - return (0); -} - -int -__dbuf_stats_hash_table_data(char *buf, size_t size, dmu_buf_impl_t *db) -{ - arc_buf_info_t abi = { 0 }; - dmu_object_info_t doi = { 0 }; - dnode_t *dn = DB_DNODE(db); - - if (db->db_buf) - arc_buf_info(db->db_buf, &abi, zfs_dbuf_state_index); - - if (dn) - __dmu_object_info_from_dnode(dn, &doi); - - size = snprintf(buf, size - 1, - "%-16s %-8llu %-8lld %-8lld %-8lld %-8llu %-8llu %-5d %-5d %-5lu | " - "%-5d %-5d %-6lld 0x%-6x %-6lu %-8llu %-12llu " - "%-6lu %-6lu %-6lu %-6lu %-6lu %-8llu %-8llu %-8d %-5lu | " - "%-6d %-6d %-8lu %-8lu %-6llu %-6lu %-5lu %-8llu %-8llu\n", - /* dmu_buf_impl_t */ - spa_name(dn->dn_objset->os_spa), - (u_longlong_t)dmu_objset_id(db->db_objset), - (longlong_t)db->db.db_object, - 
(longlong_t)db->db_level, - (longlong_t)db->db_blkid, - (u_longlong_t)db->db.db_offset, - (u_longlong_t)db->db.db_size, - !!dbuf_is_metadata(db), - db->db_state, - (ulong_t)zfs_refcount_count(&db->db_holds), - /* arc_buf_info_t */ - abi.abi_state_type, - abi.abi_state_contents, - (longlong_t)abi.abi_state_index, - abi.abi_flags, - (ulong_t)abi.abi_bufcnt, - (u_longlong_t)abi.abi_size, - (u_longlong_t)abi.abi_access, - (ulong_t)abi.abi_mru_hits, - (ulong_t)abi.abi_mru_ghost_hits, - (ulong_t)abi.abi_mfu_hits, - (ulong_t)abi.abi_mfu_ghost_hits, - (ulong_t)abi.abi_l2arc_hits, - (u_longlong_t)abi.abi_l2arc_dattr, - (u_longlong_t)abi.abi_l2arc_asize, - abi.abi_l2arc_compress, - (ulong_t)abi.abi_holds, - /* dmu_object_info_t */ - doi.doi_type, - doi.doi_bonus_type, - (ulong_t)doi.doi_data_block_size, - (ulong_t)doi.doi_metadata_block_size, - (u_longlong_t)doi.doi_bonus_size, - (ulong_t)doi.doi_indirection, - (ulong_t)zfs_refcount_count(&dn->dn_holds), - (u_longlong_t)doi.doi_fill_count, - (u_longlong_t)doi.doi_max_offset); - buf[size] = '\0'; - - return (size); -} - -static int -dbuf_stats_hash_table_data(char *buf, size_t size, void *data) -{ - dbuf_stats_t *dsh = (dbuf_stats_t *)data; - dbuf_hash_table_t *h = dsh->hash; - dmu_buf_impl_t *db; - int length, error = 0; - - ASSERT3S(dsh->idx, >=, 0); - ASSERT3S(dsh->idx, <=, h->hash_table_mask); - memset(buf, 0, size); - - mutex_enter(DBUF_HASH_MUTEX(h, dsh->idx)); - for (db = h->hash_table[dsh->idx]; db != NULL; db = db->db_hash_next) { - /* - * Returning ENOMEM will cause the data and header functions - * to be called with a larger scratch buffers. - */ - if (size < 512) { - error = ENOMEM; - break; - } - - mutex_enter(&db->db_mtx); - mutex_exit(DBUF_HASH_MUTEX(h, dsh->idx)); - - length = __dbuf_stats_hash_table_data(buf, size, db); - buf += length; - size -= length; - - mutex_exit(&db->db_mtx); - mutex_enter(DBUF_HASH_MUTEX(h, dsh->idx)); - } - mutex_exit(DBUF_HASH_MUTEX(h, dsh->idx)); - - return (error); -} - -static void * -dbuf_stats_hash_table_addr(kstat_t *ksp, off_t n) -{ - dbuf_stats_t *dsh = ksp->ks_private; - - ASSERT(MUTEX_HELD(&dsh->lock)); - - if (n <= dsh->hash->hash_table_mask) { - dsh->idx = n; - return (dsh); - } - - return (NULL); -} - -#ifndef __FreeBSD__ -/* - * XXX The FreeBSD SPL is missing support for KSTAT_TYPE_RAW - * we can enable this as soon as that's implemented. See the - * lindebugfs module for similar callback semantics. 
- */ -static void -dbuf_stats_hash_table_init(dbuf_hash_table_t *hash) -{ - dbuf_stats_t *dsh = &dbuf_stats_hash_table; - kstat_t *ksp; - - mutex_init(&dsh->lock, NULL, MUTEX_DEFAULT, NULL); - dsh->hash = hash; - - ksp = kstat_create("zfs", 0, "dbufs", "misc", - KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); - dsh->kstat = ksp; - - if (ksp) { - ksp->ks_lock = &dsh->lock; - ksp->ks_ndata = UINT32_MAX; - ksp->ks_private = dsh; - kstat_set_raw_ops(ksp, dbuf_stats_hash_table_headers, - dbuf_stats_hash_table_data, dbuf_stats_hash_table_addr); - kstat_install(ksp); - } -} - -static void -dbuf_stats_hash_table_destroy(void) -{ - dbuf_stats_t *dsh = &dbuf_stats_hash_table; - kstat_t *ksp; - - ksp = dsh->kstat; - if (ksp) - kstat_delete(ksp); - - mutex_destroy(&dsh->lock); -} -#else -static void -dbuf_stats_hash_table_init(dbuf_hash_table_t *hash) -{ -} - -static void -dbuf_stats_hash_table_destroy(void) -{ -} -#endif - -void -dbuf_stats_init(dbuf_hash_table_t *hash) -{ - dbuf_stats_hash_table_init(hash); -} - -void -dbuf_stats_destroy(void) -{ - dbuf_stats_hash_table_destroy(); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c deleted file mode 100644 index 964aa6c054f5..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c +++ /dev/null @@ -1,1189 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2016 by Delphix. All rights reserved. - */ - -#include <sys/zfs_context.h> -#include <sys/spa.h> -#include <sys/spa_impl.h> -#include <sys/zio.h> -#include <sys/ddt.h> -#include <sys/zap.h> -#include <sys/dmu_tx.h> -#include <sys/arc.h> -#include <sys/dsl_pool.h> -#include <sys/zio_checksum.h> -#include <sys/zio_compress.h> -#include <sys/dsl_scan.h> -#include <sys/abd.h> - -/* - * Enable/disable prefetching of dedup-ed blocks which are going to be freed. 
- */ -int zfs_dedup_prefetch = 1; - -SYSCTL_DECL(_vfs_zfs); -SYSCTL_NODE(_vfs_zfs, OID_AUTO, dedup, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, - "ZFS DEDUP"); -SYSCTL_INT(_vfs_zfs_dedup, OID_AUTO, prefetch, CTLFLAG_RWTUN, &zfs_dedup_prefetch, - 0, "Enable/disable prefetching of dedup-ed blocks which are going to be freed"); - -static const ddt_ops_t *ddt_ops[DDT_TYPES] = { - &ddt_zap_ops, -}; - -static const char *ddt_class_name[DDT_CLASSES] = { - "ditto", - "duplicate", - "unique", -}; - -static void -ddt_object_create(ddt_t *ddt, enum ddt_type type, enum ddt_class class, - dmu_tx_t *tx) -{ - spa_t *spa = ddt->ddt_spa; - objset_t *os = ddt->ddt_os; - uint64_t *objectp = &ddt->ddt_object[type][class]; - boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_flags & - ZCHECKSUM_FLAG_DEDUP; - char name[DDT_NAMELEN]; - - ddt_object_name(ddt, type, class, name); - - ASSERT(*objectp == 0); - VERIFY(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash) == 0); - ASSERT(*objectp != 0); - - VERIFY(zap_add(os, DMU_POOL_DIRECTORY_OBJECT, name, - sizeof (uint64_t), 1, objectp, tx) == 0); - - VERIFY(zap_add(os, spa->spa_ddt_stat_object, name, - sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), - &ddt->ddt_histogram[type][class], tx) == 0); -} - -static void -ddt_object_destroy(ddt_t *ddt, enum ddt_type type, enum ddt_class class, - dmu_tx_t *tx) -{ - spa_t *spa = ddt->ddt_spa; - objset_t *os = ddt->ddt_os; - uint64_t *objectp = &ddt->ddt_object[type][class]; - uint64_t count; - char name[DDT_NAMELEN]; - - ddt_object_name(ddt, type, class, name); - - ASSERT(*objectp != 0); - VERIFY(ddt_object_count(ddt, type, class, &count) == 0 && count == 0); - ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class])); - VERIFY(zap_remove(os, DMU_POOL_DIRECTORY_OBJECT, name, tx) == 0); - VERIFY(zap_remove(os, spa->spa_ddt_stat_object, name, tx) == 0); - VERIFY(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx) == 0); - bzero(&ddt->ddt_object_stats[type][class], sizeof (ddt_object_t)); - - *objectp = 0; -} - -static int -ddt_object_load(ddt_t *ddt, enum ddt_type type, enum ddt_class class) -{ - ddt_object_t *ddo = &ddt->ddt_object_stats[type][class]; - dmu_object_info_t doi; - uint64_t count; - char name[DDT_NAMELEN]; - int error; - - ddt_object_name(ddt, type, class, name); - - error = zap_lookup(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name, - sizeof (uint64_t), 1, &ddt->ddt_object[type][class]); - - if (error != 0) - return (error); - - VERIFY0(zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name, - sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), - &ddt->ddt_histogram[type][class])); - - /* - * Seed the cached statistics. - */ - VERIFY(ddt_object_info(ddt, type, class, &doi) == 0); - - error = ddt_object_count(ddt, type, class, &count); - if (error) - return error; - - ddo->ddo_count = count; - ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9; - ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size; - - return (0); -} - -static void -ddt_object_sync(ddt_t *ddt, enum ddt_type type, enum ddt_class class, - dmu_tx_t *tx) -{ - ddt_object_t *ddo = &ddt->ddt_object_stats[type][class]; - dmu_object_info_t doi; - uint64_t count; - char name[DDT_NAMELEN]; - - ddt_object_name(ddt, type, class, name); - - VERIFY(zap_update(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name, - sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), - &ddt->ddt_histogram[type][class], tx) == 0); - - /* - * Cache DDT statistics; this is the only time they'll change. 
- */ - VERIFY(ddt_object_info(ddt, type, class, &doi) == 0); - VERIFY(ddt_object_count(ddt, type, class, &count) == 0); - - ddo->ddo_count = count; - ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9; - ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size; -} - -static int -ddt_object_lookup(ddt_t *ddt, enum ddt_type type, enum ddt_class class, - ddt_entry_t *dde) -{ - if (!ddt_object_exists(ddt, type, class)) - return (SET_ERROR(ENOENT)); - - return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os, - ddt->ddt_object[type][class], dde)); -} - -static void -ddt_object_prefetch(ddt_t *ddt, enum ddt_type type, enum ddt_class class, - ddt_entry_t *dde) -{ - if (!ddt_object_exists(ddt, type, class)) - return; - - ddt_ops[type]->ddt_op_prefetch(ddt->ddt_os, - ddt->ddt_object[type][class], dde); -} - -int -ddt_object_update(ddt_t *ddt, enum ddt_type type, enum ddt_class class, - ddt_entry_t *dde, dmu_tx_t *tx) -{ - ASSERT(ddt_object_exists(ddt, type, class)); - - return (ddt_ops[type]->ddt_op_update(ddt->ddt_os, - ddt->ddt_object[type][class], dde, tx)); -} - -static int -ddt_object_remove(ddt_t *ddt, enum ddt_type type, enum ddt_class class, - ddt_entry_t *dde, dmu_tx_t *tx) -{ - ASSERT(ddt_object_exists(ddt, type, class)); - - return (ddt_ops[type]->ddt_op_remove(ddt->ddt_os, - ddt->ddt_object[type][class], dde, tx)); -} - -int -ddt_object_walk(ddt_t *ddt, enum ddt_type type, enum ddt_class class, - uint64_t *walk, ddt_entry_t *dde) -{ - ASSERT(ddt_object_exists(ddt, type, class)); - - return (ddt_ops[type]->ddt_op_walk(ddt->ddt_os, - ddt->ddt_object[type][class], dde, walk)); -} - -int -ddt_object_count(ddt_t *ddt, enum ddt_type type, enum ddt_class class, uint64_t *count) -{ - ASSERT(ddt_object_exists(ddt, type, class)); - - return (ddt_ops[type]->ddt_op_count(ddt->ddt_os, - ddt->ddt_object[type][class], count)); -} - -int -ddt_object_info(ddt_t *ddt, enum ddt_type type, enum ddt_class class, - dmu_object_info_t *doi) -{ - if (!ddt_object_exists(ddt, type, class)) - return (SET_ERROR(ENOENT)); - - return (dmu_object_info(ddt->ddt_os, ddt->ddt_object[type][class], - doi)); -} - -boolean_t -ddt_object_exists(ddt_t *ddt, enum ddt_type type, enum ddt_class class) -{ - return (!!ddt->ddt_object[type][class]); -} - -void -ddt_object_name(ddt_t *ddt, enum ddt_type type, enum ddt_class class, - char *name) -{ - (void) sprintf(name, DMU_POOL_DDT, - zio_checksum_table[ddt->ddt_checksum].ci_name, - ddt_ops[type]->ddt_op_name, ddt_class_name[class]); -} - -void -ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg) -{ - ASSERT(txg != 0); - - for (int d = 0; d < SPA_DVAS_PER_BP; d++) - bp->blk_dva[d] = ddp->ddp_dva[d]; - BP_SET_BIRTH(bp, txg, ddp->ddp_phys_birth); -} - -void -ddt_bp_create(enum zio_checksum checksum, - const ddt_key_t *ddk, const ddt_phys_t *ddp, blkptr_t *bp) -{ - BP_ZERO(bp); - - if (ddp != NULL) - ddt_bp_fill(ddp, bp, ddp->ddp_phys_birth); - - bp->blk_cksum = ddk->ddk_cksum; - bp->blk_fill = 1; - - BP_SET_LSIZE(bp, DDK_GET_LSIZE(ddk)); - BP_SET_PSIZE(bp, DDK_GET_PSIZE(ddk)); - BP_SET_COMPRESS(bp, DDK_GET_COMPRESS(ddk)); - BP_SET_CHECKSUM(bp, checksum); - BP_SET_TYPE(bp, DMU_OT_DEDUP); - BP_SET_LEVEL(bp, 0); - BP_SET_DEDUP(bp, 0); - BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); -} - -void -ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp) -{ - ddk->ddk_cksum = bp->blk_cksum; - ddk->ddk_prop = 0; - - DDK_SET_LSIZE(ddk, BP_GET_LSIZE(bp)); - DDK_SET_PSIZE(ddk, BP_GET_PSIZE(bp)); - DDK_SET_COMPRESS(ddk, BP_GET_COMPRESS(bp)); -} - -void -ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t 
*bp)
-{
- ASSERT(ddp->ddp_phys_birth == 0);
-
- for (int d = 0; d < SPA_DVAS_PER_BP; d++)
- ddp->ddp_dva[d] = bp->blk_dva[d];
- ddp->ddp_phys_birth = BP_PHYSICAL_BIRTH(bp);
-}
-
-void
-ddt_phys_clear(ddt_phys_t *ddp)
-{
- bzero(ddp, sizeof (*ddp));
-}
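
The ddt_object_* wrappers in the removed ddt.c above all follow one pattern: they index a small vtable of table implementations (ddt_ops[type], which currently holds only &ddt_zap_ops) and forward the call through a function pointer. The sketch below is illustrative only and not part of the diff; the names toy_ops_t, toy_zap_ops, toy_zap_count, and toy_object_count are hypothetical stand-ins used to show that dispatch shape in self-contained C.

/*
 * Minimal sketch of the ddt_ops-style vtable dispatch; toy names only,
 * not the actual ZFS API.
 */
#include <stdint.h>
#include <stdio.h>

typedef struct toy_ops {
	const char *op_name;
	int (*op_count)(uint64_t object, uint64_t *count);
} toy_ops_t;

static int
toy_zap_count(uint64_t object, uint64_t *count)
{
	/* Stand-in for an on-disk object lookup. */
	*count = object * 2;
	return (0);
}

static const toy_ops_t toy_zap_ops = {
	.op_name = "zap",
	.op_count = toy_zap_count,
};

/* One ops pointer per table type, mirroring ddt_ops[DDT_TYPES]. */
static const toy_ops_t *toy_ops[1] = {
	&toy_zap_ops,
};

static int
toy_object_count(int type, uint64_t object, uint64_t *count)
{
	/* Same shape as ddt_object_count(): indirect call via the vtable. */
	return (toy_ops[type]->op_count(object, count));
}

int
main(void)
{
	uint64_t count;

	if (toy_object_count(0, 21, &count) == 0)
		printf("%s entries: %ju\n", toy_ops[0]->op_name,
		    (uintmax_t)count);
	return (0);
}

The indirection keeps callers such as ddt_object_lookup() and ddt_object_count() independent of the on-disk representation, which is why the removed code only ever needed the ZAP-backed implementation in the table.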