author     Martin Matuska <mm@FreeBSD.org>    2021-07-07 21:31:10 +0000
committer  Martin Matuska <mm@FreeBSD.org>    2021-07-07 21:31:52 +0000
commit     7cd22ac43418da08448d0bab1009ff3cbda85120
tree       4f4d022512de4ea10d4556db0ff179970576c21b /sys/contrib/openzfs/module
parent     f68e3ea831b76a8927eed7f7abfea55ee5a193c4
parent     bdd11cbb90a2afa54fd00935ac0d34b4ddf2515c
zfs: merge openzfs/zfs@bdd11cbb9 (master) into main
Notable upstream pull request merges:
  #12274 Optimize txg_kick() process
  #12281 Move gethrtime() calls out of vdev queue lock
  #12287 Remove refcount from spa_config_*()
  #12289 Compact dbuf/buf hashes and lock arrays
  #12290 Remove avl_size field from struct avl_tree
  #12294 Upstream: dmu_zfetch_stream_fini leaks refcount
  #12295 Fix abd leak, kmem_free correct size of abd_t
  #12328 FreeBSD: Hardcode abd_chunk_size to PAGE_SIZE

Obtained from:  OpenZFS
OpenZFS commit: bdd11cbb90a2afa54fd00935ac0d34b4ddf2515c
Diffstat (limited to 'sys/contrib/openzfs/module')
-rw-r--r--  sys/contrib/openzfs/module/avl/avl.c                 |   2
-rw-r--r--  sys/contrib/openzfs/module/os/freebsd/zfs/abd_os.c   | 136
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/abd_os.c     |   3
-rw-r--r--  sys/contrib/openzfs/module/zfs/abd.c                 |   2
-rw-r--r--  sys/contrib/openzfs/module/zfs/arc.c                 |  25
-rw-r--r--  sys/contrib/openzfs/module/zfs/dbuf.c                |   6
-rw-r--r--  sys/contrib/openzfs/module/zfs/dmu_zfetch.c          |   2
-rw-r--r--  sys/contrib/openzfs/module/zfs/dsl_pool.c            |  25
-rw-r--r--  sys/contrib/openzfs/module/zfs/spa_misc.c            |  19
-rw-r--r--  sys/contrib/openzfs/module/zfs/txg.c                 |  36
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_queue.c          |  11
11 files changed, 117 insertions, 150 deletions
diff --git a/sys/contrib/openzfs/module/avl/avl.c b/sys/contrib/openzfs/module/avl/avl.c
index 1a95092bc2b6..3d36d4c87e0b 100644
--- a/sys/contrib/openzfs/module/avl/avl.c
+++ b/sys/contrib/openzfs/module/avl/avl.c
@@ -875,7 +875,6 @@ avl_swap(avl_tree_t *tree1, avl_tree_t *tree2)
ASSERT3P(tree1->avl_compar, ==, tree2->avl_compar);
ASSERT3U(tree1->avl_offset, ==, tree2->avl_offset);
- ASSERT3U(tree1->avl_size, ==, tree2->avl_size);
temp_node = tree1->avl_root;
temp_numnodes = tree1->avl_numnodes;
@@ -903,7 +902,6 @@ avl_create(avl_tree_t *tree, int (*compar) (const void *, const void *),
tree->avl_compar = compar;
tree->avl_root = NULL;
tree->avl_numnodes = 0;
- tree->avl_size = size;
tree->avl_offset = offset;
}
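
[The avl.c hunks implement #12290: avl_size duplicated what every caller already knows statically, so dropping it shrinks every avl_tree_t by one word. The calling convention is unchanged; a minimal sketch with a hypothetical my_node_t (the real callers are spread throughout the ZFS code):

    #include <stddef.h>   /* offsetof */
    #include <sys/avl.h>  /* OpenZFS libavl; header path is an assumption */

    typedef struct my_node {
        int        key;
        avl_node_t link;    /* embedded linkage, found via the offset arg */
    } my_node_t;

    static int
    my_compare(const void *a, const void *b)
    {
        const my_node_t *na = a, *nb = b;
        /* avl comparators must return exactly -1, 0, or 1 */
        return ((na->key > nb->key) - (na->key < nb->key));
    }

    static void
    my_tree_init(avl_tree_t *tree)
    {
        /*
         * The size argument survives in the API; after this change it
         * is simply no longer cached inside the tree struct.
         */
        avl_create(tree, my_compare, sizeof (my_node_t),
            offsetof(my_node_t, link));
    }
]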
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/abd_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/abd_os.c
index 47adc2278df2..95a83542fadc 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/abd_os.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/abd_os.c
@@ -79,22 +79,29 @@ struct {
} abd_sums;
/*
- * The size of the chunks ABD allocates. Because the sizes allocated from the
- * kmem_cache can't change, this tunable can only be modified at boot. Changing
- * it at runtime would cause ABD iteration to work incorrectly for ABDs which
- * were allocated with the old size, so a safeguard has been put in place which
- * will cause the machine to panic if you change it and try to access the data
- * within a scattered ABD.
+ * zfs_abd_scatter_min_size is the minimum allocation size to use scatter
+ * ABD's for. Smaller allocations will use linear ABD's which use
+ * zio_[data_]buf_alloc().
+ *
+ * Scatter ABD's use at least one page each, so sub-page allocations waste
+ * some space when allocated as scatter (e.g. 2KB scatter allocation wastes
+ * half of each page). Using linear ABD's for small allocations means that
+ * they will be put on slabs which contain many allocations.
+ *
+ * Linear ABDs for multi-page allocations are easier to use, and in some cases
+ * it allows to avoid buffer copying. But allocation and especially free
+ * of multi-page linear ABDs are expensive operations due to KVA mapping and
+ * unmapping, and with time they cause KVA fragmentations.
*/
-size_t zfs_abd_chunk_size = 4096;
+size_t zfs_abd_scatter_min_size = PAGE_SIZE + 1;
#if defined(_KERNEL)
SYSCTL_DECL(_vfs_zfs);
SYSCTL_INT(_vfs_zfs, OID_AUTO, abd_scatter_enabled, CTLFLAG_RWTUN,
&zfs_abd_scatter_enabled, 0, "Enable scattered ARC data buffers");
-SYSCTL_ULONG(_vfs_zfs, OID_AUTO, abd_chunk_size, CTLFLAG_RDTUN,
- &zfs_abd_chunk_size, 0, "The size of the chunks ABD allocates");
+SYSCTL_ULONG(_vfs_zfs, OID_AUTO, abd_scatter_min_size, CTLFLAG_RWTUN,
+ &zfs_abd_scatter_min_size, 0, "Minimum size of scatter allocations.");
#endif
kmem_cache_t *abd_chunk_cache;
@@ -102,23 +109,16 @@ static kstat_t *abd_ksp;
/*
* We use a scattered SPA_MAXBLOCKSIZE sized ABD whose chunks are
- * just a single zero'd sized zfs_abd_chunk_size buffer. This
- * allows us to conserve memory by only using a single zero buffer
- * for the scatter chunks.
+ * just a single zero'd page-sized buffer. This allows us to conserve
+ * memory by only using a single zero buffer for the scatter chunks.
*/
abd_t *abd_zero_scatter = NULL;
static char *abd_zero_buf = NULL;
-static void
-abd_free_chunk(void *c)
-{
- kmem_cache_free(abd_chunk_cache, c);
-}
-
static uint_t
abd_chunkcnt_for_bytes(size_t size)
{
- return (P2ROUNDUP(size, zfs_abd_chunk_size) / zfs_abd_chunk_size);
+ return ((size + PAGE_MASK) >> PAGE_SHIFT);
}
static inline uint_t
@@ -132,7 +132,7 @@ abd_scatter_chunkcnt(abd_t *abd)
boolean_t
abd_size_alloc_linear(size_t size)
{
- return (size <= zfs_abd_chunk_size ? B_TRUE : B_FALSE);
+ return (size < zfs_abd_scatter_min_size ? B_TRUE : B_FALSE);
}
void
@@ -140,7 +140,7 @@ abd_update_scatter_stats(abd_t *abd, abd_stats_op_t op)
{
uint_t n = abd_scatter_chunkcnt(abd);
ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR);
- int waste = n * zfs_abd_chunk_size - abd->abd_size;
+ int waste = (n << PAGE_SHIFT) - abd->abd_size;
if (op == ABDSTAT_INCR) {
ABDSTAT_BUMP(abdstat_scatter_cnt);
ABDSTAT_INCR(abdstat_scatter_data_size, abd->abd_size);
@@ -173,11 +173,11 @@ abd_verify_scatter(abd_t *abd)
uint_t i, n;
/*
- * There is no scatter linear pages in FreeBSD so there is an
- * if an error if the ABD has been marked as a linear page.
+ * There is no scatter linear pages in FreeBSD so there is
+ * an error if the ABD has been marked as a linear page.
*/
ASSERT(!abd_is_linear_page(abd));
- ASSERT3U(ABD_SCATTER(abd).abd_offset, <, zfs_abd_chunk_size);
+ ASSERT3U(ABD_SCATTER(abd).abd_offset, <, PAGE_SIZE);
n = abd_scatter_chunkcnt(abd);
for (i = 0; i < n; i++) {
ASSERT3P(ABD_SCATTER(abd).abd_chunks[i], !=, NULL);
@@ -191,11 +191,9 @@ abd_alloc_chunks(abd_t *abd, size_t size)
n = abd_chunkcnt_for_bytes(size);
for (i = 0; i < n; i++) {
- void *c = kmem_cache_alloc(abd_chunk_cache, KM_PUSHPAGE);
- ASSERT3P(c, !=, NULL);
- ABD_SCATTER(abd).abd_chunks[i] = c;
+ ABD_SCATTER(abd).abd_chunks[i] =
+ kmem_cache_alloc(abd_chunk_cache, KM_PUSHPAGE);
}
- ABD_SCATTER(abd).abd_chunk_size = zfs_abd_chunk_size;
}
void
@@ -205,7 +203,8 @@ abd_free_chunks(abd_t *abd)
n = abd_scatter_chunkcnt(abd);
for (i = 0; i < n; i++) {
- abd_free_chunk(ABD_SCATTER(abd).abd_chunks[i]);
+ kmem_cache_free(abd_chunk_cache,
+ ABD_SCATTER(abd).abd_chunks[i]);
}
}
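
[With the chunk size hardcoded to PAGE_SIZE (#12328), abd_chunkcnt_for_bytes() above turns a P2ROUNDUP-plus-division into a mask-and-shift. A standalone check of the equivalence, assuming 4 KB pages and FreeBSD's PAGE_MASK == PAGE_SIZE - 1 convention:

    #include <assert.h>
    #include <stddef.h>

    #define PAGE_SHIFT  12
    #define PAGE_SIZE   (1UL << PAGE_SHIFT)          /* 4096 */
    #define PAGE_MASK   (PAGE_SIZE - 1)              /* 0xfff, FreeBSD-style */
    #define P2ROUNDUP(x, align) (-(-(x) & -(align)))

    int
    main(void)
    {
        for (size_t size = 1; size <= 4 * PAGE_SIZE; size++) {
            size_t by_div = P2ROUNDUP(size, PAGE_SIZE) / PAGE_SIZE;
            size_t by_shift = (size + PAGE_MASK) >> PAGE_SHIFT;
            assert(by_div == by_shift);  /* e.g. 6000 bytes -> 2 pages */
        }
        return (0);
    }
]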
@@ -250,15 +249,13 @@ abd_alloc_zero_scatter(void)
uint_t i, n;
n = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE);
- abd_zero_buf = kmem_zalloc(zfs_abd_chunk_size, KM_SLEEP);
+ abd_zero_buf = kmem_cache_alloc(abd_chunk_cache, KM_PUSHPAGE);
abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE);
abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER | ABD_FLAG_ZEROS;
abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE;
ABD_SCATTER(abd_zero_scatter).abd_offset = 0;
- ABD_SCATTER(abd_zero_scatter).abd_chunk_size =
- zfs_abd_chunk_size;
for (i = 0; i < n; i++) {
ABD_SCATTER(abd_zero_scatter).abd_chunks[i] =
@@ -266,18 +263,18 @@ abd_alloc_zero_scatter(void)
}
ABDSTAT_BUMP(abdstat_scatter_cnt);
- ABDSTAT_INCR(abdstat_scatter_data_size, zfs_abd_chunk_size);
+ ABDSTAT_INCR(abdstat_scatter_data_size, PAGE_SIZE);
}
static void
abd_free_zero_scatter(void)
{
ABDSTAT_BUMPDOWN(abdstat_scatter_cnt);
- ABDSTAT_INCR(abdstat_scatter_data_size, -(int)zfs_abd_chunk_size);
+ ABDSTAT_INCR(abdstat_scatter_data_size, -(int)PAGE_SIZE);
abd_free_struct(abd_zero_scatter);
abd_zero_scatter = NULL;
- kmem_free(abd_zero_buf, zfs_abd_chunk_size);
+ kmem_cache_free(abd_chunk_cache, abd_zero_buf);
}
static int
@@ -305,7 +302,7 @@ abd_kstats_update(kstat_t *ksp, int rw)
void
abd_init(void)
{
- abd_chunk_cache = kmem_cache_create("abd_chunk", zfs_abd_chunk_size, 0,
+ abd_chunk_cache = kmem_cache_create("abd_chunk", PAGE_SIZE, 0,
NULL, NULL, NULL, NULL, 0, KMC_NODEBUG);
wmsum_init(&abd_sums.abdstat_struct_size, 0);
@@ -374,14 +371,17 @@ abd_alloc_for_io(size_t size, boolean_t is_metadata)
}
abd_t *
-abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off)
+abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off,
+ size_t size)
{
abd_verify(sabd);
ASSERT3U(off, <=, sabd->abd_size);
size_t new_offset = ABD_SCATTER(sabd).abd_offset + off;
- uint_t chunkcnt = abd_scatter_chunkcnt(sabd) -
- (new_offset / zfs_abd_chunk_size);
+ size_t chunkcnt = abd_chunkcnt_for_bytes(
+ (new_offset & PAGE_MASK) + size);
+
+ ASSERT3U(chunkcnt, <=, abd_scatter_chunkcnt(sabd));
/*
* If an abd struct is provided, it is only the minimum size. If we
@@ -394,7 +394,7 @@ abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off)
}
if (abd == NULL)
- abd = abd_alloc_struct(chunkcnt * zfs_abd_chunk_size);
+ abd = abd_alloc_struct(chunkcnt << PAGE_SHIFT);
/*
* Even if this buf is filesystem metadata, we only track that
@@ -402,34 +402,16 @@ abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off)
* this case. Therefore, we don't ever use ABD_FLAG_META here.
*/
- ABD_SCATTER(abd).abd_offset = new_offset % zfs_abd_chunk_size;
- ABD_SCATTER(abd).abd_chunk_size = zfs_abd_chunk_size;
+ ABD_SCATTER(abd).abd_offset = new_offset & PAGE_MASK;
/* Copy the scatterlist starting at the correct offset */
(void) memcpy(&ABD_SCATTER(abd).abd_chunks,
- &ABD_SCATTER(sabd).abd_chunks[new_offset /
- zfs_abd_chunk_size],
+ &ABD_SCATTER(sabd).abd_chunks[new_offset >> PAGE_SHIFT],
chunkcnt * sizeof (void *));
return (abd);
}
-static inline size_t
-abd_iter_scatter_chunk_offset(struct abd_iter *aiter)
-{
- ASSERT(!abd_is_linear(aiter->iter_abd));
- return ((ABD_SCATTER(aiter->iter_abd).abd_offset +
- aiter->iter_pos) % zfs_abd_chunk_size);
-}
-
-static inline size_t
-abd_iter_scatter_chunk_index(struct abd_iter *aiter)
-{
- ASSERT(!abd_is_linear(aiter->iter_abd));
- return ((ABD_SCATTER(aiter->iter_abd).abd_offset +
- aiter->iter_pos) / zfs_abd_chunk_size);
-}
-
/*
* Initialize the abd_iter.
*/
@@ -480,29 +462,25 @@ void
abd_iter_map(struct abd_iter *aiter)
{
void *paddr;
- size_t offset = 0;
ASSERT3P(aiter->iter_mapaddr, ==, NULL);
ASSERT0(aiter->iter_mapsize);
- /* Panic if someone has changed zfs_abd_chunk_size */
- IMPLY(!abd_is_linear(aiter->iter_abd), zfs_abd_chunk_size ==
- ABD_SCATTER(aiter->iter_abd).abd_chunk_size);
-
/* There's nothing left to iterate over, so do nothing */
if (abd_iter_at_end(aiter))
return;
- if (abd_is_linear(aiter->iter_abd)) {
- offset = aiter->iter_pos;
- aiter->iter_mapsize = aiter->iter_abd->abd_size - offset;
- paddr = ABD_LINEAR_BUF(aiter->iter_abd);
+ abd_t *abd = aiter->iter_abd;
+ size_t offset = aiter->iter_pos;
+ if (abd_is_linear(abd)) {
+ aiter->iter_mapsize = abd->abd_size - offset;
+ paddr = ABD_LINEAR_BUF(abd);
} else {
- size_t index = abd_iter_scatter_chunk_index(aiter);
- offset = abd_iter_scatter_chunk_offset(aiter);
- aiter->iter_mapsize = MIN(zfs_abd_chunk_size - offset,
- aiter->iter_abd->abd_size - aiter->iter_pos);
- paddr = ABD_SCATTER(aiter->iter_abd).abd_chunks[index];
+ offset += ABD_SCATTER(abd).abd_offset;
+ paddr = ABD_SCATTER(abd).abd_chunks[offset >> PAGE_SHIFT];
+ offset &= PAGE_MASK;
+ aiter->iter_mapsize = MIN(PAGE_SIZE - offset,
+ abd->abd_size - aiter->iter_pos);
}
aiter->iter_mapaddr = (char *)paddr + offset;
}
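
[The rewritten scatter branch of abd_iter_map() above folds the two deleted index/offset helpers into three lines: add the ABD's starting offset to the iterator position, take the high bits as the chunk index, and keep the low bits as the offset within that page. A worked standalone check, assuming 4 KB pages:

    #include <assert.h>
    #include <stddef.h>

    #define PAGE_SHIFT  12
    #define PAGE_SIZE   (1UL << PAGE_SHIFT)
    #define PAGE_MASK   (PAGE_SIZE - 1)

    int
    main(void)
    {
        size_t abd_offset = 1000;  /* scatter ABD starts 1000 B into page 0 */
        size_t abd_size = 10000;
        size_t iter_pos = 5000;    /* logical position within the ABD */

        size_t offset = iter_pos + abd_offset;       /* 6000 */
        size_t index = offset >> PAGE_SHIFT;         /* chunk 1 */
        offset &= PAGE_MASK;                         /* 1904 into that page */
        size_t mapsize = PAGE_SIZE - offset;         /* 2192 B mappable */
        if (mapsize > abd_size - iter_pos)
            mapsize = abd_size - iter_pos;

        assert(index == 1 && offset == 1904 && mapsize == 2192);
        return (0);
    }
]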
@@ -514,12 +492,10 @@ abd_iter_map(struct abd_iter *aiter)
void
abd_iter_unmap(struct abd_iter *aiter)
{
- /* There's nothing left to unmap, so do nothing */
- if (abd_iter_at_end(aiter))
- return;
-
- ASSERT3P(aiter->iter_mapaddr, !=, NULL);
- ASSERT3U(aiter->iter_mapsize, >, 0);
+ if (!abd_iter_at_end(aiter)) {
+ ASSERT3P(aiter->iter_mapaddr, !=, NULL);
+ ASSERT3U(aiter->iter_mapsize, >, 0);
+ }
aiter->iter_mapaddr = NULL;
aiter->iter_mapsize = 0;
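
[The new size argument to abd_get_offset_scatter() above (threaded through from abd_get_offset_impl() as part of the #12295 abd_t size fix) lets a clone size its chunk list from the bytes the view actually covers, (new_offset & PAGE_MASK) + size rounded up to pages, rather than from the offset all the way to the end of the parent. A worked standalone check, assuming 4 KB pages:

    #include <assert.h>
    #include <stddef.h>

    #define PAGE_SHIFT  12
    #define PAGE_MASK   ((1UL << PAGE_SHIFT) - 1)

    static size_t
    chunkcnt_for_bytes(size_t size)
    {
        return ((size + PAGE_MASK) >> PAGE_SHIFT);
    }

    int
    main(void)
    {
        size_t new_offset = 5000, size = 3000;  /* view: bytes 5000..7999 */
        size_t first = new_offset >> PAGE_SHIFT;               /* page 1 */
        size_t last = (new_offset + size - 1) >> PAGE_SHIFT;   /* page 1 */
        /* the formula from the hunk: one page, not every page from 1 on */
        size_t chunkcnt = chunkcnt_for_bytes((new_offset & PAGE_MASK) + size);
        assert(chunkcnt == last - first + 1);
        return (0);
    }
]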
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c b/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c
index af543d6e3f7e..d1d238a4e303 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c
@@ -835,7 +835,8 @@ abd_alloc_for_io(size_t size, boolean_t is_metadata)
}
abd_t *
-abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off)
+abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off,
+ size_t size)
{
int i = 0;
struct scatterlist *sg = NULL;
diff --git a/sys/contrib/openzfs/module/zfs/abd.c b/sys/contrib/openzfs/module/zfs/abd.c
index d5fafccd08af..cc2d3575db63 100644
--- a/sys/contrib/openzfs/module/zfs/abd.c
+++ b/sys/contrib/openzfs/module/zfs/abd.c
@@ -531,7 +531,7 @@ abd_get_offset_impl(abd_t *abd, abd_t *sabd, size_t off, size_t size)
}
ASSERT3U(left, ==, 0);
} else {
- abd = abd_get_offset_scatter(abd, sabd, off);
+ abd = abd_get_offset_scatter(abd, sabd, off, size);
}
ASSERT3P(abd, !=, NULL);
diff --git a/sys/contrib/openzfs/module/zfs/arc.c b/sys/contrib/openzfs/module/zfs/arc.c
index 3484fff3b4d4..394ca1bfe42d 100644
--- a/sys/contrib/openzfs/module/zfs/arc.c
+++ b/sys/contrib/openzfs/module/zfs/arc.c
@@ -740,29 +740,18 @@ taskq_t *arc_prune_taskq;
* Hash table routines
*/
-#define HT_LOCK_ALIGN 64
-#define HT_LOCK_PAD (P2NPHASE(sizeof (kmutex_t), (HT_LOCK_ALIGN)))
-
-struct ht_lock {
- kmutex_t ht_lock;
-#ifdef _KERNEL
- unsigned char pad[HT_LOCK_PAD];
-#endif
-};
-
-#define BUF_LOCKS 8192
+#define BUF_LOCKS 2048
typedef struct buf_hash_table {
uint64_t ht_mask;
arc_buf_hdr_t **ht_table;
- struct ht_lock ht_locks[BUF_LOCKS];
+ kmutex_t ht_locks[BUF_LOCKS] ____cacheline_aligned;
} buf_hash_table_t;
static buf_hash_table_t buf_hash_table;
#define BUF_HASH_INDEX(spa, dva, birth) \
(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
-#define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
-#define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
+#define BUF_HASH_LOCK(idx) (&buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
#define HDR_LOCK(hdr) \
(BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
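
[The old ht_lock struct hand-padded every mutex to a 64-byte line; the compacted table (#12289) packs the kmutex_t's, cacheline-aligns only the array start, and drops BUF_LOCKS from 8192 to 2048, cutting static footprint. For contrast, a minimal C11/pthreads sketch of the per-element padding idiom the deleted struct implemented (names hypothetical):

    #include <pthread.h>
    #include <stdalign.h>

    #define CACHE_LINE  64
    #define N_LOCKS     2048

    /*
     * An alignas(64) member makes each array element start on its own
     * cache line, so contention on one bucket's lock does not bounce
     * the line holding its neighbours.
     */
    typedef struct padded_lock {
        alignas(CACHE_LINE) pthread_mutex_t lock;
    } padded_lock_t;

    static padded_lock_t hash_locks[N_LOCKS];

    static pthread_mutex_t *
    hash_lock(unsigned long idx)
    {
        /* N_LOCKS is a power of two, so masking replaces a modulo. */
        return (&hash_locks[idx & (N_LOCKS - 1)].lock);
    }
]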
@@ -1111,7 +1100,7 @@ buf_fini(void)
(buf_hash_table.ht_mask + 1) * sizeof (void *));
#endif
for (i = 0; i < BUF_LOCKS; i++)
- mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
+ mutex_destroy(BUF_HASH_LOCK(i));
kmem_cache_destroy(hdr_full_cache);
kmem_cache_destroy(hdr_full_crypt_cache);
kmem_cache_destroy(hdr_l2only_cache);
@@ -1276,10 +1265,8 @@ retry:
for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
- for (i = 0; i < BUF_LOCKS; i++) {
- mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
- NULL, MUTEX_DEFAULT, NULL);
- }
+ for (i = 0; i < BUF_LOCKS; i++)
+ mutex_init(BUF_HASH_LOCK(i), NULL, MUTEX_DEFAULT, NULL);
}
#define ARC_MINTIME (hz>>4) /* 62 ms */
diff --git a/sys/contrib/openzfs/module/zfs/dbuf.c b/sys/contrib/openzfs/module/zfs/dbuf.c
index 9ce091b80dcb..289247c6ed65 100644
--- a/sys/contrib/openzfs/module/zfs/dbuf.c
+++ b/sys/contrib/openzfs/module/zfs/dbuf.c
@@ -826,12 +826,12 @@ dbuf_init(void)
int i;
/*
- * The hash table is big enough to fill all of physical memory
+ * The hash table is big enough to fill one eighth of physical memory
* with an average block size of zfs_arc_average_blocksize (default 8K).
* By default, the table will take up
* totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers).
*/
- while (hsize * zfs_arc_average_blocksize < physmem * PAGESIZE)
+ while (hsize * zfs_arc_average_blocksize < arc_all_memory() / 8)
hsize <<= 1;
retry:
@@ -3055,8 +3055,8 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
db->db_state = DB_EVICTING; /* not worth logging this state change */
if ((odb = dbuf_hash_insert(db)) != NULL) {
/* someone else inserted it first */
- kmem_cache_free(dbuf_kmem_cache, db);
mutex_exit(&dn->dn_dbufs_mtx);
+ kmem_cache_free(dbuf_kmem_cache, db);
DBUF_STAT_BUMP(hash_insert_race);
return (odb);
}
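
[Two fixes above: the table is now sized against one eighth of memory as measured by arc_all_memory(), matching the corrected comment, and the kmem_cache_free() of a losing dbuf moves after mutex_exit() so no allocator work happens under dn_dbufs_mtx. The sizing arithmetic as a standalone check, assuming a 64-bit build, 16 GiB of memory, and the default 8 KiB zfs_arc_average_blocksize:

    #include <assert.h>
    #include <stdint.h>

    int
    main(void)
    {
        uint64_t all_memory = 16ULL << 30;  /* assumed: 16 GiB */
        uint64_t avg_blocksize = 8192;      /* default 8 KiB */
        uint64_t hsize = 1ULL << 10;        /* any small power of two */

        while (hsize * avg_blocksize < all_memory / 8)
            hsize <<= 1;

        assert(hsize == (1ULL << 18));                   /* 262144 buckets */
        assert(hsize * sizeof (void *) == (2ULL << 20)); /* 2 MiB of ptrs */
        return (0);
    }
]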
diff --git a/sys/contrib/openzfs/module/zfs/dmu_zfetch.c b/sys/contrib/openzfs/module/zfs/dmu_zfetch.c
index 4a323fa990fe..a26b0d739921 100644
--- a/sys/contrib/openzfs/module/zfs/dmu_zfetch.c
+++ b/sys/contrib/openzfs/module/zfs/dmu_zfetch.c
@@ -159,6 +159,8 @@ static void
dmu_zfetch_stream_fini(zstream_t *zs)
{
ASSERT(!list_link_active(&zs->zs_node));
+ zfs_refcount_destroy(&zs->zs_callers);
+ zfs_refcount_destroy(&zs->zs_refs);
kmem_free(zs, sizeof (*zs));
}
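
[#12294: in the tracked (debug) configuration, zfs_refcount_create() allocates per-refcount bookkeeping that only zfs_refcount_destroy() releases, so freeing a zstream_t without these two calls leaked that state on every stream teardown. The general discipline, sketched standalone with hypothetical types (not the ZFS API):

    #include <stdlib.h>

    typedef struct refcount {
        void *tracker;          /* stand-in for per-refcount bookkeeping */
    } refcount_t;

    static void refcount_create(refcount_t *rc)  { rc->tracker = malloc(64); }
    static void refcount_destroy(refcount_t *rc) { free(rc->tracker); }

    typedef struct stream {
        refcount_t refs;
        refcount_t callers;
    } stream_t;

    static void
    stream_fini(stream_t *zs)
    {
        /* the fix: tear down embedded resources before freeing the shell */
        refcount_destroy(&zs->callers);
        refcount_destroy(&zs->refs);
        free(zs);
    }
]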
diff --git a/sys/contrib/openzfs/module/zfs/dsl_pool.c b/sys/contrib/openzfs/module/zfs/dsl_pool.c
index e66c136a9e02..72f4b86d772e 100644
--- a/sys/contrib/openzfs/module/zfs/dsl_pool.c
+++ b/sys/contrib/openzfs/module/zfs/dsl_pool.c
@@ -898,18 +898,26 @@ dsl_pool_need_dirty_delay(dsl_pool_t *dp)
{
uint64_t delay_min_bytes =
zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
- uint64_t dirty_min_bytes =
- zfs_dirty_data_max * zfs_dirty_data_sync_percent / 100;
- uint64_t dirty;
mutex_enter(&dp->dp_lock);
- dirty = dp->dp_dirty_total;
+ uint64_t dirty = dp->dp_dirty_total;
mutex_exit(&dp->dp_lock);
- if (dirty > dirty_min_bytes)
- txg_kick(dp);
+
return (dirty > delay_min_bytes);
}
+static boolean_t
+dsl_pool_need_dirty_sync(dsl_pool_t *dp, uint64_t txg)
+{
+ ASSERT(MUTEX_HELD(&dp->dp_lock));
+
+ uint64_t dirty_min_bytes =
+ zfs_dirty_data_max * zfs_dirty_data_sync_percent / 100;
+ uint64_t dirty = dp->dp_dirty_pertxg[txg & TXG_MASK];
+
+ return (dirty > dirty_min_bytes);
+}
+
void
dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
{
@@ -917,7 +925,12 @@ dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
mutex_enter(&dp->dp_lock);
dp->dp_dirty_pertxg[tx->tx_txg & TXG_MASK] += space;
dsl_pool_dirty_delta(dp, space);
+ boolean_t needsync = !dmu_tx_is_syncing(tx) &&
+ dsl_pool_need_dirty_sync(dp, tx->tx_txg);
mutex_exit(&dp->dp_lock);
+
+ if (needsync)
+ txg_kick(dp, tx->tx_txg);
}
}
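
[Part of #12274: the decision to sync now keys off the dirty bytes of the specific txg, taken while dp_lock is already held in dsl_pool_dirty_space(), leaving dsl_pool_need_dirty_delay() with only the throttling question. With the usual defaults (an assumption; the values are not in this diff) the two thresholds sit well apart:

    #include <assert.h>
    #include <stdint.h>

    int
    main(void)
    {
        /* assumed defaults: 4 GiB cap, 60% delay, 20% sync */
        uint64_t zfs_dirty_data_max = 4ULL << 30;
        uint64_t zfs_delay_min_dirty_percent = 60;
        uint64_t zfs_dirty_data_sync_percent = 20;

        uint64_t delay_min_bytes = zfs_dirty_data_max *
            zfs_delay_min_dirty_percent / 100;       /* ~2.4 GiB */
        uint64_t dirty_min_bytes = zfs_dirty_data_max *
            zfs_dirty_data_sync_percent / 100;       /* ~819 MiB */

        /* a txg is kicked to sync long before writes start being delayed */
        assert(dirty_min_bytes < delay_min_bytes);
        return (0);
    }
]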
diff --git a/sys/contrib/openzfs/module/zfs/spa_misc.c b/sys/contrib/openzfs/module/zfs/spa_misc.c
index e2523231d280..157dede93cfc 100644
--- a/sys/contrib/openzfs/module/zfs/spa_misc.c
+++ b/sys/contrib/openzfs/module/zfs/spa_misc.c
@@ -444,9 +444,9 @@ spa_config_lock_init(spa_t *spa)
spa_config_lock_t *scl = &spa->spa_config_lock[i];
mutex_init(&scl->scl_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&scl->scl_cv, NULL, CV_DEFAULT, NULL);
- zfs_refcount_create_untracked(&scl->scl_count);
scl->scl_writer = NULL;
scl->scl_write_wanted = 0;
+ scl->scl_count = 0;
}
}
@@ -457,9 +457,9 @@ spa_config_lock_destroy(spa_t *spa)
spa_config_lock_t *scl = &spa->spa_config_lock[i];
mutex_destroy(&scl->scl_lock);
cv_destroy(&scl->scl_cv);
- zfs_refcount_destroy(&scl->scl_count);
ASSERT(scl->scl_writer == NULL);
ASSERT(scl->scl_write_wanted == 0);
+ ASSERT(scl->scl_count == 0);
}
}
@@ -480,7 +480,7 @@ spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw)
}
} else {
ASSERT(scl->scl_writer != curthread);
- if (!zfs_refcount_is_zero(&scl->scl_count)) {
+ if (scl->scl_count != 0) {
mutex_exit(&scl->scl_lock);
spa_config_exit(spa, locks & ((1 << i) - 1),
tag);
@@ -488,7 +488,7 @@ spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw)
}
scl->scl_writer = curthread;
}
- (void) zfs_refcount_add(&scl->scl_count, tag);
+ scl->scl_count++;
mutex_exit(&scl->scl_lock);
}
return (1);
@@ -514,14 +514,14 @@ spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw)
}
} else {
ASSERT(scl->scl_writer != curthread);
- while (!zfs_refcount_is_zero(&scl->scl_count)) {
+ while (scl->scl_count != 0) {
scl->scl_write_wanted++;
cv_wait(&scl->scl_cv, &scl->scl_lock);
scl->scl_write_wanted--;
}
scl->scl_writer = curthread;
}
- (void) zfs_refcount_add(&scl->scl_count, tag);
+ scl->scl_count++;
mutex_exit(&scl->scl_lock);
}
ASSERT3U(wlocks_held, <=, locks);
@@ -535,8 +535,8 @@ spa_config_exit(spa_t *spa, int locks, const void *tag)
if (!(locks & (1 << i)))
continue;
mutex_enter(&scl->scl_lock);
- ASSERT(!zfs_refcount_is_zero(&scl->scl_count));
- if (zfs_refcount_remove(&scl->scl_count, tag) == 0) {
+ ASSERT(scl->scl_count > 0);
+ if (--scl->scl_count == 0) {
ASSERT(scl->scl_writer == NULL ||
scl->scl_writer == curthread);
scl->scl_writer = NULL; /* OK in either case */
@@ -555,8 +555,7 @@ spa_config_held(spa_t *spa, int locks, krw_t rw)
spa_config_lock_t *scl = &spa->spa_config_lock[i];
if (!(locks & (1 << i)))
continue;
- if ((rw == RW_READER &&
- !zfs_refcount_is_zero(&scl->scl_count)) ||
+ if ((rw == RW_READER && scl->scl_count != 0) ||
(rw == RW_WRITER && scl->scl_writer == curthread))
locks_held |= 1 << i;
}
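
[#12287: every mutation of scl_count happens under scl_lock (the read in spa_config_held() is an advisory snapshot), so the tracked zfs_refcount_t added atomics and bookkeeping for nothing; a plain counter is equivalent. A simplified pthreads sketch of the invariant, reader side only (the real lock also handles writer hand-off and scl_write_wanted):

    #include <assert.h>
    #include <pthread.h>

    typedef struct cfg_lock {
        pthread_mutex_t lock;
        pthread_cond_t  cv;
        int             count;      /* plain int: only touched under lock */
        int             has_writer;
    } cfg_lock_t;

    static void
    cfg_enter_read(cfg_lock_t *scl)
    {
        pthread_mutex_lock(&scl->lock);
        while (scl->has_writer)
            pthread_cond_wait(&scl->cv, &scl->lock);
        scl->count++;               /* no atomics needed under the mutex */
        pthread_mutex_unlock(&scl->lock);
    }

    static void
    cfg_exit(cfg_lock_t *scl)
    {
        pthread_mutex_lock(&scl->lock);
        assert(scl->count > 0);
        if (--scl->count == 0)
            pthread_cond_broadcast(&scl->cv);   /* wake waiting writers */
        pthread_mutex_unlock(&scl->lock);
    }
]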
diff --git a/sys/contrib/openzfs/module/zfs/txg.c b/sys/contrib/openzfs/module/zfs/txg.c
index c55b1d8f9601..c9eb84bbdb12 100644
--- a/sys/contrib/openzfs/module/zfs/txg.c
+++ b/sys/contrib/openzfs/module/zfs/txg.c
@@ -499,14 +499,6 @@ txg_wait_callbacks(dsl_pool_t *dp)
}
static boolean_t
-txg_is_syncing(dsl_pool_t *dp)
-{
- tx_state_t *tx = &dp->dp_tx;
- ASSERT(MUTEX_HELD(&tx->tx_sync_lock));
- return (tx->tx_syncing_txg != 0);
-}
-
-static boolean_t
txg_is_quiescing(dsl_pool_t *dp)
{
tx_state_t *tx = &dp->dp_tx;
@@ -539,8 +531,6 @@ txg_sync_thread(void *arg)
clock_t timeout = zfs_txg_timeout * hz;
clock_t timer;
uint64_t txg;
- uint64_t dirty_min_bytes =
- zfs_dirty_data_max * zfs_dirty_data_sync_percent / 100;
/*
* We sync when we're scanning, there's someone waiting
@@ -551,8 +541,7 @@ txg_sync_thread(void *arg)
while (!dsl_scan_active(dp->dp_scan) &&
!tx->tx_exiting && timer > 0 &&
tx->tx_synced_txg >= tx->tx_sync_txg_waiting &&
- !txg_has_quiesced_to_sync(dp) &&
- dp->dp_dirty_total < dirty_min_bytes) {
+ !txg_has_quiesced_to_sync(dp)) {
dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n",
(u_longlong_t)tx->tx_synced_txg,
(u_longlong_t)tx->tx_sync_txg_waiting, dp);
@@ -566,6 +555,11 @@ txg_sync_thread(void *arg)
* prompting it to do so if necessary.
*/
while (!tx->tx_exiting && !txg_has_quiesced_to_sync(dp)) {
+ if (txg_is_quiescing(dp)) {
+ txg_thread_wait(tx, &cpr,
+ &tx->tx_quiesce_done_cv, 0);
+ continue;
+ }
if (tx->tx_quiesce_txg_waiting < tx->tx_open_txg+1)
tx->tx_quiesce_txg_waiting = tx->tx_open_txg+1;
cv_broadcast(&tx->tx_quiesce_more_cv);
@@ -791,24 +785,22 @@ txg_wait_open(dsl_pool_t *dp, uint64_t txg, boolean_t should_quiesce)
}
/*
- * If there isn't a txg syncing or in the pipeline, push another txg through
- * the pipeline by quiescing the open txg.
+ * Pass in the txg number that should be synced.
*/
void
-txg_kick(dsl_pool_t *dp)
+txg_kick(dsl_pool_t *dp, uint64_t txg)
{
tx_state_t *tx = &dp->dp_tx;
ASSERT(!dsl_pool_config_held(dp));
+ if (tx->tx_sync_txg_waiting >= txg)
+ return;
+
mutex_enter(&tx->tx_sync_lock);
- if (!txg_is_syncing(dp) &&
- !txg_is_quiescing(dp) &&
- tx->tx_quiesce_txg_waiting <= tx->tx_open_txg &&
- tx->tx_sync_txg_waiting <= tx->tx_synced_txg &&
- tx->tx_quiesced_txg <= tx->tx_synced_txg) {
- tx->tx_quiesce_txg_waiting = tx->tx_open_txg + 1;
- cv_broadcast(&tx->tx_quiesce_more_cv);
+ if (tx->tx_sync_txg_waiting < txg) {
+ tx->tx_sync_txg_waiting = txg;
+ cv_broadcast(&tx->tx_sync_more_cv);
}
mutex_exit(&tx->tx_sync_lock);
}
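
[With #12274, txg_kick() names the txg that must reach disk and wakes the sync thread directly through tx_sync_more_cv instead of quiescing the open txg and hoping the pipeline drains; the sync thread correspondingly stops watching dp_dirty_total itself and now waits out an in-progress quiesce. The unlocked pre-check makes repeated kicks for an already-scheduled txg nearly free. The same double-checked pattern, standalone (hypothetical names):

    #include <pthread.h>
    #include <stdint.h>

    typedef struct tx_state {
        pthread_mutex_t sync_lock;
        pthread_cond_t  sync_more_cv;
        uint64_t        sync_txg_waiting;
    } tx_state_t;

    static void
    kick(tx_state_t *tx, uint64_t txg)
    {
        if (tx->sync_txg_waiting >= txg)    /* racy but safe fast path */
            return;

        pthread_mutex_lock(&tx->sync_lock);
        if (tx->sync_txg_waiting < txg) {   /* re-check under the lock */
            tx->sync_txg_waiting = txg;
            pthread_cond_broadcast(&tx->sync_more_cv);
        }
        pthread_mutex_unlock(&tx->sync_lock);
    }
]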
diff --git a/sys/contrib/openzfs/module/zfs/vdev_queue.c b/sys/contrib/openzfs/module/zfs/vdev_queue.c
index 198861edb816..06d22f6df4c5 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_queue.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_queue.c
@@ -912,9 +912,9 @@ vdev_queue_io(zio_t *zio)
}
zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE;
+ zio->io_timestamp = gethrtime();
mutex_enter(&vq->vq_lock);
- zio->io_timestamp = gethrtime();
vdev_queue_io_add(vq, zio);
nio = vdev_queue_io_to_issue(vq);
mutex_exit(&vq->vq_lock);
@@ -936,14 +936,13 @@ vdev_queue_io_done(zio_t *zio)
vdev_queue_t *vq = &zio->io_vd->vdev_queue;
zio_t *nio;
- mutex_enter(&vq->vq_lock);
+ hrtime_t now = gethrtime();
+ vq->vq_io_complete_ts = now;
+ vq->vq_io_delta_ts = zio->io_delta = now - zio->io_timestamp;
+ mutex_enter(&vq->vq_lock);
vdev_queue_pending_remove(vq, zio);
- zio->io_delta = gethrtime() - zio->io_timestamp;
- vq->vq_io_complete_ts = gethrtime();
- vq->vq_io_delta_ts = vq->vq_io_complete_ts - zio->io_timestamp;
-
while ((nio = vdev_queue_io_to_issue(vq)) != NULL) {
mutex_exit(&vq->vq_lock);
if (nio->io_done == vdev_queue_agg_io_done) {
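
[#12281: gethrtime() is comparatively expensive on some platforms, so both call sites above now read the clock before taking vq_lock, and vdev_queue_io_done() collapses its two gethrtime() calls into a single reading used for vq_io_complete_ts, vq_io_delta_ts, and zio->io_delta. The shape of the change, sketched standalone with hypothetical names:

    #include <pthread.h>
    #include <stdint.h>
    #include <time.h>

    typedef struct io { uint64_t timestamp; } io_t;

    static pthread_mutex_t vq_lock = PTHREAD_MUTEX_INITIALIZER;
    static uint64_t vq_io_complete_ts, vq_io_delta_ts;

    static uint64_t
    now_ns(void)
    {
        struct timespec ts;
        clock_gettime(CLOCK_MONOTONIC, &ts);
        return ((uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec);
    }

    static void
    io_done(io_t *zio)
    {
        uint64_t now = now_ns();    /* one clock read, before the lock */

        /* advisory statistics, updated outside the lock as in the hunk */
        vq_io_complete_ts = now;
        vq_io_delta_ts = now - zio->timestamp;

        pthread_mutex_lock(&vq_lock);
        /* ... only pending-tree removal and next-I/O selection here ... */
        pthread_mutex_unlock(&vq_lock);
    }
]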