author    Matt Macy <mmacy@FreeBSD.org>    2020-08-25 02:21:27 +0000
committer Matt Macy <mmacy@FreeBSD.org>    2020-08-25 02:21:27 +0000
commit    9e5787d2284e187abb5b654d924394a65772e004 (patch)
tree      2ebf833af6b1953d4a683e2da830fe87bf3435e1 /sys/cddl/contrib/opensolaris/uts/common/fs
parent    22df1ffd812f0395cdb7c0b1edae1f67b991562a (diff)
Merge OpenZFS support into HEAD.
The primary benefit is maintaining a completely shared code base with the community, allowing FreeBSD to receive new features sooner and with less effort.

I would advise against doing 'zpool upgrade' or creating indispensable pools using new features until this change has had a month or more to soak.

Work on merging FreeBSD support into what was at the time "ZFS on Linux" began in August 2018. I first publicly proposed transitioning FreeBSD to (new) OpenZFS on December 18th, 2018. FreeBSD support in OpenZFS was finally completed in December 2019. A call for testing (CFT) for downstreaming OpenZFS support into FreeBSD was first issued on July 8th. All issues that were reported have been addressed or, for a couple of less critical matters, have pull requests in progress with OpenZFS. iXsystems has tested and dogfooded extensively internally. The TrueNAS 12 release is based on OpenZFS with some additional features that have not yet made it upstream.

Improvements include: project quotas, encrypted datasets, allocation classes, vectorized raidz, vectorized checksums, various command-line improvements, and zstd compression.

Thanks to those who have helped along the way: Ryan Moeller, Allan Jude, Zack Welch, and many others.

Sponsored by:	iXsystems, Inc.
Differential Revision:	https://reviews.freebsd.org/D25872
Notes: svn path=/head/; revision=364746
Diffstat (limited to 'sys/cddl/contrib/opensolaris/uts/common/fs')
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c  94
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.cityhash  19
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.cityhash.descrip  1
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.lz4  30
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.lz4.descrip  1
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/abd.c  960
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/aggsum.c  234
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c  8569
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/blkptr.c  152
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c  77
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c  606
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c  301
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bqueue.c  111
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/cityhash.c  63
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c  4248
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf_stats.c  242
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c  1189
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt_zap.c  165
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c  2748
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c  251
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c  444
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c  2484
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c  3550
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c  712
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c  1345
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c  374
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c  2418
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c  779
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_bookmark.c  566
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c  4252
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c  561
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c  760
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c  1097
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c  2184
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c  1372
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c  1211
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c  4001
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c  256
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_userhold.c  667
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/edonr_zfs.c  114
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/gzip.c  69
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/README.zfs  80
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lapi.c  1283
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lapi.h  24
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lauxlib.c  791
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lauxlib.h  176
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lbaselib.c  296
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lbitlib.c  212
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcode.c  885
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcode.h  83
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcompat.c  102
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcorolib.c  154
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lctype.c  52
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lctype.h  93
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldebug.c  607
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldebug.h  34
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldo.c  691
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldo.h  46
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldump.c  173
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lfunc.c  161
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lfunc.h  33
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lgc.c  1220
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lgc.h  157
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/llex.c  529
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/llex.h  78
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/llimits.h  308
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lmem.c  99
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lmem.h  57
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lobject.c  283
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lobject.h  606
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lopcodes.c  107
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lopcodes.h  288
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lparser.c  1637
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lparser.h  119
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstate.c  321
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstate.h  228
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstring.c  185
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstring.h  46
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstrlib.c  1050
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltable.c  589
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltable.h  45
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltablib.c  284
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltm.c  77
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltm.h  57
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lua.h  443
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/luaconf.h  555
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lualib.h  55
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lundump.c  258
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lundump.h  28
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lvm.c  930
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lvm.h  44
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lzio.c  76
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lzio.h  65
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lzjb.c  129
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c  4624
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/mmp.c  750
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/multilist.c  423
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c  670
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c  321
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/rrwlock.c  396
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c  2012
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c  105
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/skein_zfs.c  105
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c  8972
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_checkpoint.c  623
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c  594
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c  406
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c  628
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c  2523
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c  1073
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_reftree.c  149
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/abd.h  154
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/aggsum.h  58
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h  290
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/blkptr.h  39
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h  57
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bpobj.h  95
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bptree.h  65
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bqueue.h  54
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/cityhash.h  41
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h  417
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/ddt.h  248
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h  1028
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h  315
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h  221
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h  93
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h  69
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h  152
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_zfetch.h  76
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h  599
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_bookmark.h  52
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h  457
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deadlist.h  89
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deleg.h  81
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_destroy.h  68
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h  209
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h  191
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h  115
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_scan.h  188
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h  127
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_userhold.h  57
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h  127
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h  501
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/mmp.h  74
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/multilist.h  107
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/range_tree.h  124
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h  125
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/rrwlock.h  112
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa.h  170
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa_impl.h  291
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h  969
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_boot.h  48
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_checkpoint.h  44
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h  435
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h  230
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_reftree.h  57
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/trim_map.h  51
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h  136
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h  125
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock.h  50
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h  145
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/unique.h  57
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h  196
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_disk.h  67
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_file.h  49
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h  571
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_indirect_births.h  80
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_indirect_mapping.h  141
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_initialize.h  46
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_raidz.h  50
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_removal.h  96
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h  514
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h  242
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h  248
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zcp.h  185
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zcp_global.h  35
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zcp_iter.h  41
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zcp_prop.h  34
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfeature.h  73
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h  248
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h  146
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ctldir.h  65
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h  99
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h  74
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_fuid.h  132
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h  466
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_onexit.h  66
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_rlock.h  90
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_sa.h  142
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_stat.h  55
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h  192
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h  374
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h  464
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h  229
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h  675
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h  119
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_compress.h  128
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h  256
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_priority.h  43
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zrlock.h  63
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zthr.h  39
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h  85
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/trim_map.c  634
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c  977
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/uberblock.c  74
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/unique.c  112
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c  4520
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c  434
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c  971
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c  307
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c  1193
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c  1849
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_births.c  212
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_mapping.c  593
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_initialize.c  782
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c  1701
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c  779
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c  113
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c  1047
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c  2707
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c  2156
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c  157
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c  1378
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c  849
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c  1609
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp.c  1432
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_get.c  865
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_global.c  89
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_iter.c  531
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_synctask.c  360
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c  505
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs.conf  28
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c  2778
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c  199
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c  1364
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_debug.c  112
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c  968
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c  871
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c  762
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c  7692
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c  688
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_onexit.c  254
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c  1069
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c  641
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_sa.c  326
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c  2813
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c  6124
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c  2388
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c  3499
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c  4386
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c  475
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c  215
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c  755
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zle.c  86
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zrlock.c  187
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zthr.c  431
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c  3347
257 files changed, 0 insertions, 178575 deletions
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c
deleted file mode 100644
index 6d82470d220a..000000000000
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c
+++ /dev/null
@@ -1,94 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
- */
-
-/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
-/* All Rights Reserved */
-
-/*
- * University Copyright- Copyright (c) 1982, 1986, 1988
- * The Regents of the University of California
- * All Rights Reserved
- *
- * University Acknowledgment- Portions of this document are derived from
- * software developed by the University of California, Berkeley, and its
- * contributors.
- */
-
-#include <sys/types.h>
-#include <sys/param.h>
-#include <sys/proc.h>
-#include <sys/taskq.h>
-#include <sys/vnode.h>
-
-/* Extensible attribute (xva) routines. */
-
-/*
- * Zero out the structure, set the size of the requested/returned bitmaps,
- * set AT_XVATTR in the embedded vattr_t's va_mask, and set up the pointer
- * to the returned attributes array.
- */
-void
-xva_init(xvattr_t *xvap)
-{
- bzero(xvap, sizeof (xvattr_t));
- xvap->xva_mapsize = XVA_MAPSIZE;
- xvap->xva_magic = XVA_MAGIC;
- xvap->xva_vattr.va_mask = AT_XVATTR;
- xvap->xva_rtnattrmapp = &(xvap->xva_rtnattrmap)[0];
-}
-
-/*
- * If AT_XVATTR is set, returns a pointer to the embedded xoptattr_t
- * structure. Otherwise, returns NULL.
- */
-xoptattr_t *
-xva_getxoptattr(xvattr_t *xvap)
-{
- xoptattr_t *xoap = NULL;
- if (xvap->xva_vattr.va_mask & AT_XVATTR)
- xoap = &xvap->xva_xoptattrs;
- return (xoap);
-}
-
-/*
- * Like vn_rele() except if we are going to call VOP_INACTIVE() then do it
- * asynchronously using a taskq. This can avoid deadlocks caused by re-entering
- * the file system as a result of releasing the vnode. Note, file systems
- * already have to handle the race where the vnode is incremented before the
- * inactive routine is called and does its locking.
- *
- * Warning: Excessive use of this routine can lead to performance problems.
- * This is because taskqs throttle back allocation if too many are created.
- */
-void
-vn_rele_async(vnode_t *vp, taskq_t *taskq)
-{
- VERIFY(vp->v_count > 0);
- if (refcount_release_if_not_last(&vp->v_usecount)) {
- return;
- }
- VERIFY(taskq_dispatch((taskq_t *)taskq,
- (task_func_t *)vrele, vp, TQ_SLEEP) != 0);
-}
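
As a reader's aid, here is a caller-side sketch of the deferred-release pattern vn_rele_async() implements. It is not part of the commit; the taskq name, thread count, and priority are illustrative, assuming the illumos-compat taskq API available to this file:

#include <sys/taskq.h>
#include <sys/vnode.h>

static taskq_t *vn_rele_taskq;	/* hypothetical dedicated release taskq */

void
example_setup(void)
{
	/* Parameters are illustrative: a single prepopulated worker. */
	vn_rele_taskq = taskq_create("vn_rele_taskq", 1, minclsyspri,
	    1, INT_MAX, TASKQ_PREPOPULATE);
}

void
example_release(vnode_t *vp)
{
	/*
	 * Drop a hold without re-entering the file system: if this is
	 * the last reference, vrele() (and thus VOP_INACTIVE()) runs
	 * from the taskq thread rather than the caller's context.
	 */
	vn_rele_async(vp, vn_rele_taskq);
}

As the warning above notes, a single shared taskq (rather than one per caller) keeps taskq allocation from being throttled.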
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.cityhash b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.cityhash
deleted file mode 100644
index e558b2a50358..000000000000
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.cityhash
+++ /dev/null
@@ -1,19 +0,0 @@
-Copyright (c) 2011 Google, Inc.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.cityhash.descrip b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.cityhash.descrip
deleted file mode 100644
index f98cb76dfc91..000000000000
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.cityhash.descrip
+++ /dev/null
@@ -1 +0,0 @@
-CITYHASH CHECKSUM FUNCTIONALITY IN ZFS
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.lz4 b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.lz4
deleted file mode 100644
index 722cc75f01e9..000000000000
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.lz4
+++ /dev/null
@@ -1,30 +0,0 @@
-LZ4 - Fast LZ compression algorithm
-Copyright (C) 2011-2013, Yann Collet.
-BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
-IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
-TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
-OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-You can contact the author at :
-- LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html
-- LZ4 source repository : http://code.google.com/p/lz4/
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.lz4.descrip b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.lz4.descrip
deleted file mode 100644
index 211f679b5749..000000000000
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.lz4.descrip
+++ /dev/null
@@ -1 +0,0 @@
-LZ4 COMPRESSION FUNCTIONALITY IN ZFS
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/abd.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/abd.c
deleted file mode 100644
index 1843c8161038..000000000000
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/abd.c
+++ /dev/null
@@ -1,960 +0,0 @@
-/*
- * This file and its contents are supplied under the terms of the
- * Common Development and Distribution License ("CDDL"), version 1.0.
- * You may only use this file in accordance with the terms of version
- * 1.0 of the CDDL.
- *
- * A full copy of the text of the CDDL should have accompanied this
- * source. A copy of the CDDL is also available via the Internet at
- * http://www.illumos.org/license/CDDL.
- */
-
-/*
- * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
- * Copyright (c) 2016 by Delphix. All rights reserved.
- */
-
-/*
- * ARC buffer data (ABD).
- *
- * ABDs are an abstract data structure for the ARC which can use two
- * different ways of storing the underlying data:
- *
- * (a) Linear buffer. In this case, all the data in the ABD is stored in one
- * contiguous buffer in memory (from a zio_[data_]buf_* kmem cache).
- *
- * +-------------------+
- * | ABD (linear) |
- * | abd_flags = ... |
- * | abd_size = ... | +--------------------------------+
- * | abd_buf ------------->| raw buffer of size abd_size |
- * +-------------------+ +--------------------------------+
- * no abd_chunks
- *
- * (b) Scattered buffer. In this case, the data in the ABD is split into
- * equal-sized chunks (from the abd_chunk_cache kmem_cache), with pointers
- * to the chunks recorded in an array at the end of the ABD structure.
- *
- * +-------------------+
- * | ABD (scattered) |
- * | abd_flags = ... |
- * | abd_size = ... |
- * | abd_offset = 0 | +-----------+
- * | abd_chunks[0] ----------------------------->| chunk 0 |
- * | abd_chunks[1] ---------------------+ +-----------+
- * | ... | | +-----------+
- * | abd_chunks[N-1] ---------+ +------->| chunk 1 |
- * +-------------------+ | +-----------+
- * | ...
- * | +-----------+
- * +----------------->| chunk N-1 |
- * +-----------+
- *
- * Using a large proportion of scattered ABDs decreases ARC fragmentation since
- * when we are at the limit of allocatable space, using equal-size chunks will
- * allow us to quickly reclaim enough space for a new large allocation (assuming
- * it is also scattered).
- *
- * In addition to directly allocating a linear or scattered ABD, it is also
- * possible to create an ABD by requesting the "sub-ABD" starting at an offset
- * within an existing ABD. In linear buffers this is simple (set abd_buf of
- * the new ABD to the starting point within the original raw buffer), but
- * scattered ABDs are a little more complex. The new ABD makes a copy of the
- * relevant abd_chunks pointers (but not the underlying data). However, to
- * provide arbitrary rather than only chunk-aligned starting offsets, it also
- * tracks an abd_offset field which represents the starting point of the data
- * within the first chunk in abd_chunks. For both linear and scattered ABDs,
- * creating an offset ABD marks the original ABD as the offset's parent, and the
- * original ABD's abd_children refcount is incremented. This data allows us to
- * ensure the root ABD isn't deleted before its children.
- *
- * Most consumers should never need to know what type of ABD they're using --
- * the ABD public API ensures that it's possible to transparently switch from
- * using a linear ABD to a scattered one when doing so would be beneficial.
- *
- * If you need to use the data within an ABD directly, if you know it's linear
- * (because you allocated it) you can use abd_to_buf() to access the underlying
- * raw buffer. Otherwise, you should use one of the abd_borrow_buf* functions
- * which will allocate a raw buffer if necessary. Use the abd_return_buf*
- * functions to return any raw buffers that are no longer necessary when you're
- * done using them.
- *
- * There are a variety of ABD APIs that implement basic buffer operations:
- * compare, copy, read, write, and fill with zeroes. If you need a custom
- * function which progressively accesses the whole ABD, use the abd_iterate_*
- * functions.
- */
-
-#include <sys/abd.h>
-#include <sys/param.h>
-#include <sys/zio.h>
-#include <sys/zfs_context.h>
-#include <sys/zfs_znode.h>
-
-typedef struct abd_stats {
- kstat_named_t abdstat_struct_size;
- kstat_named_t abdstat_scatter_cnt;
- kstat_named_t abdstat_scatter_data_size;
- kstat_named_t abdstat_scatter_chunk_waste;
- kstat_named_t abdstat_linear_cnt;
- kstat_named_t abdstat_linear_data_size;
-} abd_stats_t;
-
-static abd_stats_t abd_stats = {
- /* Amount of memory occupied by all of the abd_t struct allocations */
- { "struct_size", KSTAT_DATA_UINT64 },
- /*
- * The number of scatter ABDs which are currently allocated, excluding
- * ABDs which don't own their data (for instance the ones which were
- * allocated through abd_get_offset()).
- */
- { "scatter_cnt", KSTAT_DATA_UINT64 },
- /* Amount of data stored in all scatter ABDs tracked by scatter_cnt */
- { "scatter_data_size", KSTAT_DATA_UINT64 },
- /*
- * The amount of space wasted at the end of the last chunk across all
- * scatter ABDs tracked by scatter_cnt.
- */
- { "scatter_chunk_waste", KSTAT_DATA_UINT64 },
- /*
- * The number of linear ABDs which are currently allocated, excluding
- * ABDs which don't own their data (for instance the ones which were
- * allocated through abd_get_offset() and abd_get_from_buf()). If an
- * ABD takes ownership of its buf then it will become tracked.
- */
- { "linear_cnt", KSTAT_DATA_UINT64 },
- /* Amount of data stored in all linear ABDs tracked by linear_cnt */
- { "linear_data_size", KSTAT_DATA_UINT64 },
-};
-
-#define ABDSTAT(stat) (abd_stats.stat.value.ui64)
-#define ABDSTAT_INCR(stat, val) \
- atomic_add_64(&abd_stats.stat.value.ui64, (val))
-#define ABDSTAT_BUMP(stat) ABDSTAT_INCR(stat, 1)
-#define ABDSTAT_BUMPDOWN(stat) ABDSTAT_INCR(stat, -1)
-
-/*
- * It is possible to make all future ABDs be linear by setting this to B_FALSE.
- * Otherwise, ABDs are allocated scattered by default unless the caller uses
- * abd_alloc_linear().
- */
-boolean_t zfs_abd_scatter_enabled = B_TRUE;
-
-/*
- * The size of the chunks ABD allocates. Because the sizes allocated from the
- * kmem_cache can't change, this tunable can only be modified at boot. Changing
- * it at runtime would cause ABD iteration to work incorrectly for ABDs which
- * were allocated with the old size, so a safeguard has been put in place which
- * will cause the machine to panic if you change it and try to access the data
- * within a scattered ABD.
- */
-size_t zfs_abd_chunk_size = 4096;
-
-#if defined(__FreeBSD__) && defined(_KERNEL)
-SYSCTL_DECL(_vfs_zfs);
-
-SYSCTL_INT(_vfs_zfs, OID_AUTO, abd_scatter_enabled, CTLFLAG_RWTUN,
- &zfs_abd_scatter_enabled, 0, "Enable scattered ARC data buffers");
-SYSCTL_ULONG(_vfs_zfs, OID_AUTO, abd_chunk_size, CTLFLAG_RDTUN,
- &zfs_abd_chunk_size, 0, "The size of the chunks ABD allocates");
-#endif
-
-#ifdef _KERNEL
-extern vmem_t *zio_alloc_arena;
-#endif
-
-kmem_cache_t *abd_chunk_cache;
-static kstat_t *abd_ksp;
-
-extern inline boolean_t abd_is_linear(abd_t *abd);
-extern inline void abd_copy(abd_t *dabd, abd_t *sabd, size_t size);
-extern inline void abd_copy_from_buf(abd_t *abd, const void *buf, size_t size);
-extern inline void abd_copy_to_buf(void* buf, abd_t *abd, size_t size);
-extern inline int abd_cmp_buf(abd_t *abd, const void *buf, size_t size);
-extern inline void abd_zero(abd_t *abd, size_t size);
-
-static void *
-abd_alloc_chunk()
-{
- void *c = kmem_cache_alloc(abd_chunk_cache, KM_PUSHPAGE);
- ASSERT3P(c, !=, NULL);
- return (c);
-}
-
-static void
-abd_free_chunk(void *c)
-{
- kmem_cache_free(abd_chunk_cache, c);
-}
-
-void
-abd_init(void)
-{
-#ifdef illumos
- vmem_t *data_alloc_arena = NULL;
-
-#ifdef _KERNEL
- data_alloc_arena = zio_alloc_arena;
-#endif
-
- /*
- * Since ABD chunks do not appear in crash dumps, we pass KMC_NOTOUCH
- * so that no allocator metadata is stored with the buffers.
- */
- abd_chunk_cache = kmem_cache_create("abd_chunk", zfs_abd_chunk_size, 0,
- NULL, NULL, NULL, NULL, data_alloc_arena, KMC_NOTOUCH);
-#else
- abd_chunk_cache = kmem_cache_create("abd_chunk", zfs_abd_chunk_size, 0,
- NULL, NULL, NULL, NULL, 0, KMC_NOTOUCH | KMC_NODEBUG);
-#endif
- abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED,
- sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
- if (abd_ksp != NULL) {
- abd_ksp->ks_data = &abd_stats;
- kstat_install(abd_ksp);
- }
-}
-
-void
-abd_fini(void)
-{
- if (abd_ksp != NULL) {
- kstat_delete(abd_ksp);
- abd_ksp = NULL;
- }
-
- kmem_cache_destroy(abd_chunk_cache);
- abd_chunk_cache = NULL;
-}
-
-static inline size_t
-abd_chunkcnt_for_bytes(size_t size)
-{
- return (P2ROUNDUP(size, zfs_abd_chunk_size) / zfs_abd_chunk_size);
-}
-
-static inline size_t
-abd_scatter_chunkcnt(abd_t *abd)
-{
- ASSERT(!abd_is_linear(abd));
- return (abd_chunkcnt_for_bytes(
- abd->abd_u.abd_scatter.abd_offset + abd->abd_size));
-}
-
-static inline void
-abd_verify(abd_t *abd)
-{
- ASSERT3U(abd->abd_size, >, 0);
- ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE);
- ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR |
- ABD_FLAG_OWNER | ABD_FLAG_META));
- IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER));
- IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER);
- if (abd_is_linear(abd)) {
- ASSERT3P(abd->abd_u.abd_linear.abd_buf, !=, NULL);
- } else {
- ASSERT3U(abd->abd_u.abd_scatter.abd_offset, <,
- zfs_abd_chunk_size);
- size_t n = abd_scatter_chunkcnt(abd);
- for (int i = 0; i < n; i++) {
- ASSERT3P(
- abd->abd_u.abd_scatter.abd_chunks[i], !=, NULL);
- }
- }
-}
-
-static inline abd_t *
-abd_alloc_struct(size_t chunkcnt)
-{
- size_t size = offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]);
- abd_t *abd = kmem_alloc(size, KM_PUSHPAGE);
- ASSERT3P(abd, !=, NULL);
- ABDSTAT_INCR(abdstat_struct_size, size);
-
- return (abd);
-}
-
-static inline void
-abd_free_struct(abd_t *abd)
-{
- size_t chunkcnt = abd_is_linear(abd) ? 0 : abd_scatter_chunkcnt(abd);
- int size = offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]);
- kmem_free(abd, size);
- ABDSTAT_INCR(abdstat_struct_size, -size);
-}
-
-/*
- * Allocate an ABD, along with its own underlying data buffers. Use this if you
- * don't care whether the ABD is linear or not.
- */
-abd_t *
-abd_alloc(size_t size, boolean_t is_metadata)
-{
- if (!zfs_abd_scatter_enabled || size <= zfs_abd_chunk_size)
- return (abd_alloc_linear(size, is_metadata));
-
- VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);
-
- size_t n = abd_chunkcnt_for_bytes(size);
- abd_t *abd = abd_alloc_struct(n);
-
- abd->abd_flags = ABD_FLAG_OWNER;
- if (is_metadata) {
- abd->abd_flags |= ABD_FLAG_META;
- }
- abd->abd_size = size;
- abd->abd_parent = NULL;
- zfs_refcount_create(&abd->abd_children);
-
- abd->abd_u.abd_scatter.abd_offset = 0;
- abd->abd_u.abd_scatter.abd_chunk_size = zfs_abd_chunk_size;
-
- for (int i = 0; i < n; i++) {
- void *c = abd_alloc_chunk();
- ASSERT3P(c, !=, NULL);
- abd->abd_u.abd_scatter.abd_chunks[i] = c;
- }
-
- ABDSTAT_BUMP(abdstat_scatter_cnt);
- ABDSTAT_INCR(abdstat_scatter_data_size, size);
- ABDSTAT_INCR(abdstat_scatter_chunk_waste,
- n * zfs_abd_chunk_size - size);
-
- return (abd);
-}
-
-static void
-abd_free_scatter(abd_t *abd)
-{
- size_t n = abd_scatter_chunkcnt(abd);
- for (int i = 0; i < n; i++) {
- abd_free_chunk(abd->abd_u.abd_scatter.abd_chunks[i]);
- }
-
- zfs_refcount_destroy(&abd->abd_children);
- ABDSTAT_BUMPDOWN(abdstat_scatter_cnt);
- ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size);
- ABDSTAT_INCR(abdstat_scatter_chunk_waste,
- abd->abd_size - n * zfs_abd_chunk_size);
-
- abd_free_struct(abd);
-}
-
-/*
- * Allocate an ABD that must be linear, along with its own underlying data
- * buffer. Only use this when it would be very annoying to write your ABD
- * consumer with a scattered ABD.
- */
-abd_t *
-abd_alloc_linear(size_t size, boolean_t is_metadata)
-{
- abd_t *abd = abd_alloc_struct(0);
-
- VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);
-
- abd->abd_flags = ABD_FLAG_LINEAR | ABD_FLAG_OWNER;
- if (is_metadata) {
- abd->abd_flags |= ABD_FLAG_META;
- }
- abd->abd_size = size;
- abd->abd_parent = NULL;
- zfs_refcount_create(&abd->abd_children);
-
- if (is_metadata) {
- abd->abd_u.abd_linear.abd_buf = zio_buf_alloc(size);
- } else {
- abd->abd_u.abd_linear.abd_buf = zio_data_buf_alloc(size);
- }
-
- ABDSTAT_BUMP(abdstat_linear_cnt);
- ABDSTAT_INCR(abdstat_linear_data_size, size);
-
- return (abd);
-}
-
-static void
-abd_free_linear(abd_t *abd)
-{
- if (abd->abd_flags & ABD_FLAG_META) {
- zio_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size);
- } else {
- zio_data_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size);
- }
-
- zfs_refcount_destroy(&abd->abd_children);
- ABDSTAT_BUMPDOWN(abdstat_linear_cnt);
- ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size);
-
- abd_free_struct(abd);
-}
-
-/*
- * Free an ABD. Only use this on ABDs allocated with abd_alloc() or
- * abd_alloc_linear().
- */
-void
-abd_free(abd_t *abd)
-{
- abd_verify(abd);
- ASSERT3P(abd->abd_parent, ==, NULL);
- ASSERT(abd->abd_flags & ABD_FLAG_OWNER);
- if (abd_is_linear(abd))
- abd_free_linear(abd);
- else
- abd_free_scatter(abd);
-}
-
-/*
- * Allocate an ABD of the same format (same metadata flag, same scatterize
- * setting) as another ABD.
- */
-abd_t *
-abd_alloc_sametype(abd_t *sabd, size_t size)
-{
- boolean_t is_metadata = (sabd->abd_flags & ABD_FLAG_META) != 0;
- if (abd_is_linear(sabd)) {
- return (abd_alloc_linear(size, is_metadata));
- } else {
- return (abd_alloc(size, is_metadata));
- }
-}
-
-/*
- * If we're going to use this ABD for doing I/O using the block layer, the
- * consumer of the ABD data doesn't care if it's scattered or not, and we don't
- * plan to store this ABD in memory for a long period of time, we should
- * allocate the ABD type that requires the least data copying to do the I/O.
- *
- * Currently this is linear ABDs, however if ldi_strategy() can ever issue I/Os
- * using a scatter/gather list we should switch to that and replace this call
- * with vanilla abd_alloc().
- */
-abd_t *
-abd_alloc_for_io(size_t size, boolean_t is_metadata)
-{
- return (abd_alloc_linear(size, is_metadata));
-}
-
-/*
- * Allocate a new ABD to point to offset off of sabd. It shares the underlying
- * buffer data with sabd. Use abd_put() to free. sabd must not be freed while
- * any derived ABDs exist.
- */
-abd_t *
-abd_get_offset(abd_t *sabd, size_t off)
-{
- abd_t *abd;
-
- abd_verify(sabd);
- ASSERT3U(off, <=, sabd->abd_size);
-
- if (abd_is_linear(sabd)) {
- abd = abd_alloc_struct(0);
-
- /*
- * Even if this buf is filesystem metadata, we only track that
- * if we own the underlying data buffer, which is not true in
- * this case. Therefore, we don't ever use ABD_FLAG_META here.
- */
- abd->abd_flags = ABD_FLAG_LINEAR;
-
- abd->abd_u.abd_linear.abd_buf =
- (char *)sabd->abd_u.abd_linear.abd_buf + off;
- } else {
- size_t new_offset = sabd->abd_u.abd_scatter.abd_offset + off;
- size_t chunkcnt = abd_scatter_chunkcnt(sabd) -
- (new_offset / zfs_abd_chunk_size);
-
- abd = abd_alloc_struct(chunkcnt);
-
- /*
- * Even if this buf is filesystem metadata, we only track that
- * if we own the underlying data buffer, which is not true in
- * this case. Therefore, we don't ever use ABD_FLAG_META here.
- */
- abd->abd_flags = 0;
-
- abd->abd_u.abd_scatter.abd_offset =
- new_offset % zfs_abd_chunk_size;
- abd->abd_u.abd_scatter.abd_chunk_size = zfs_abd_chunk_size;
-
- /* Copy the scatterlist starting at the correct offset */
- (void) memcpy(&abd->abd_u.abd_scatter.abd_chunks,
- &sabd->abd_u.abd_scatter.abd_chunks[new_offset /
- zfs_abd_chunk_size],
- chunkcnt * sizeof (void *));
- }
-
- abd->abd_size = sabd->abd_size - off;
- abd->abd_parent = sabd;
- zfs_refcount_create(&abd->abd_children);
- (void) zfs_refcount_add_many(&sabd->abd_children, abd->abd_size, abd);
-
- return (abd);
-}
-
-/*
- * Allocate a linear ABD structure for buf. You must free this with abd_put()
- * since the resulting ABD doesn't own its own buffer.
- */
-abd_t *
-abd_get_from_buf(void *buf, size_t size)
-{
- abd_t *abd = abd_alloc_struct(0);
-
- VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);
-
- /*
- * Even if this buf is filesystem metadata, we only track that if we
- * own the underlying data buffer, which is not true in this case.
- * Therefore, we don't ever use ABD_FLAG_META here.
- */
- abd->abd_flags = ABD_FLAG_LINEAR;
- abd->abd_size = size;
- abd->abd_parent = NULL;
- zfs_refcount_create(&abd->abd_children);
-
- abd->abd_u.abd_linear.abd_buf = buf;
-
- return (abd);
-}
-
-/*
- * Free an ABD allocated from abd_get_offset() or abd_get_from_buf(). Will not
- * free the underlying scatterlist or buffer.
- */
-void
-abd_put(abd_t *abd)
-{
- abd_verify(abd);
- ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER));
-
- if (abd->abd_parent != NULL) {
- (void) zfs_refcount_remove_many(&abd->abd_parent->abd_children,
- abd->abd_size, abd);
- }
-
- zfs_refcount_destroy(&abd->abd_children);
- abd_free_struct(abd);
-}
-
-/*
- * Get the raw buffer associated with a linear ABD.
- */
-void *
-abd_to_buf(abd_t *abd)
-{
- ASSERT(abd_is_linear(abd));
- abd_verify(abd);
- return (abd->abd_u.abd_linear.abd_buf);
-}
-
-/*
- * Borrow a raw buffer from an ABD without copying the contents of the ABD
- * into the buffer. If the ABD is scattered, this will allocate a raw buffer
- * whose contents are undefined. To copy over the existing data in the ABD, use
- * abd_borrow_buf_copy() instead.
- */
-void *
-abd_borrow_buf(abd_t *abd, size_t n)
-{
- void *buf;
- abd_verify(abd);
- ASSERT3U(abd->abd_size, >=, n);
- if (abd_is_linear(abd)) {
- buf = abd_to_buf(abd);
- } else {
- buf = zio_buf_alloc(n);
- }
- (void) zfs_refcount_add_many(&abd->abd_children, n, buf);
-
- return (buf);
-}
-
-void *
-abd_borrow_buf_copy(abd_t *abd, size_t n)
-{
- void *buf = abd_borrow_buf(abd, n);
- if (!abd_is_linear(abd)) {
- abd_copy_to_buf(buf, abd, n);
- }
- return (buf);
-}
-
-/*
- * Return a borrowed raw buffer to an ABD. If the ABD is scattered, this will
- * not change the contents of the ABD and will ASSERT that you didn't modify
- * the buffer since it was borrowed. If you want any changes you made to buf to
- * be copied back to abd, use abd_return_buf_copy() instead.
- */
-void
-abd_return_buf(abd_t *abd, void *buf, size_t n)
-{
- abd_verify(abd);
- ASSERT3U(abd->abd_size, >=, n);
- if (abd_is_linear(abd)) {
- ASSERT3P(buf, ==, abd_to_buf(abd));
- } else {
- ASSERT0(abd_cmp_buf(abd, buf, n));
- zio_buf_free(buf, n);
- }
- (void) zfs_refcount_remove_many(&abd->abd_children, n, buf);
-}
-
-void
-abd_return_buf_copy(abd_t *abd, void *buf, size_t n)
-{
- if (!abd_is_linear(abd)) {
- abd_copy_from_buf(abd, buf, n);
- }
- abd_return_buf(abd, buf, n);
-}
-
-/*
- * Give this ABD ownership of the buffer that it's storing. Can only be used on
- * linear ABDs which were allocated via abd_get_from_buf(), or ones allocated
- * with abd_alloc_linear() which subsequently released ownership of their buf
- * with abd_release_ownership_of_buf().
- */
-void
-abd_take_ownership_of_buf(abd_t *abd, boolean_t is_metadata)
-{
- ASSERT(abd_is_linear(abd));
- ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER));
- abd_verify(abd);
-
- abd->abd_flags |= ABD_FLAG_OWNER;
- if (is_metadata) {
- abd->abd_flags |= ABD_FLAG_META;
- }
-
- ABDSTAT_BUMP(abdstat_linear_cnt);
- ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size);
-}
-
-void
-abd_release_ownership_of_buf(abd_t *abd)
-{
- ASSERT(abd_is_linear(abd));
- ASSERT(abd->abd_flags & ABD_FLAG_OWNER);
- abd_verify(abd);
-
- abd->abd_flags &= ~ABD_FLAG_OWNER;
- /* Disable this flag since we no longer own the data buffer */
- abd->abd_flags &= ~ABD_FLAG_META;
-
- ABDSTAT_BUMPDOWN(abdstat_linear_cnt);
- ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size);
-}
-
-struct abd_iter {
- abd_t *iter_abd; /* ABD being iterated through */
- size_t iter_pos; /* position (relative to abd_offset) */
- void *iter_mapaddr; /* addr corresponding to iter_pos */
- size_t iter_mapsize; /* length of data valid at mapaddr */
-};
-
-static inline size_t
-abd_iter_scatter_chunk_offset(struct abd_iter *aiter)
-{
- ASSERT(!abd_is_linear(aiter->iter_abd));
- return ((aiter->iter_abd->abd_u.abd_scatter.abd_offset +
- aiter->iter_pos) % zfs_abd_chunk_size);
-}
-
-static inline size_t
-abd_iter_scatter_chunk_index(struct abd_iter *aiter)
-{
- ASSERT(!abd_is_linear(aiter->iter_abd));
- return ((aiter->iter_abd->abd_u.abd_scatter.abd_offset +
- aiter->iter_pos) / zfs_abd_chunk_size);
-}
-
-/*
- * Initialize the abd_iter.
- */
-static void
-abd_iter_init(struct abd_iter *aiter, abd_t *abd)
-{
- abd_verify(abd);
- aiter->iter_abd = abd;
- aiter->iter_pos = 0;
- aiter->iter_mapaddr = NULL;
- aiter->iter_mapsize = 0;
-}
-
-/*
- * Advance the iterator by a certain amount. Cannot be called when a chunk is
- * in use. This can be safely called when the aiter has already been exhausted, in
- * which case this does nothing.
- */
-static void
-abd_iter_advance(struct abd_iter *aiter, size_t amount)
-{
- ASSERT3P(aiter->iter_mapaddr, ==, NULL);
- ASSERT0(aiter->iter_mapsize);
-
- /* There's nothing left to advance to, so do nothing */
- if (aiter->iter_pos == aiter->iter_abd->abd_size)
- return;
-
- aiter->iter_pos += amount;
-}
-
-/*
- * Map the current chunk into aiter. This can be safely called when the aiter
- * has already been exhausted, in which case this does nothing.
- */
-static void
-abd_iter_map(struct abd_iter *aiter)
-{
- void *paddr;
- size_t offset = 0;
-
- ASSERT3P(aiter->iter_mapaddr, ==, NULL);
- ASSERT0(aiter->iter_mapsize);
-
- /* Panic if someone has changed zfs_abd_chunk_size */
- IMPLY(!abd_is_linear(aiter->iter_abd), zfs_abd_chunk_size ==
- aiter->iter_abd->abd_u.abd_scatter.abd_chunk_size);
-
- /* There's nothing left to iterate over, so do nothing */
- if (aiter->iter_pos == aiter->iter_abd->abd_size)
- return;
-
- if (abd_is_linear(aiter->iter_abd)) {
- offset = aiter->iter_pos;
- aiter->iter_mapsize = aiter->iter_abd->abd_size - offset;
- paddr = aiter->iter_abd->abd_u.abd_linear.abd_buf;
- } else {
- size_t index = abd_iter_scatter_chunk_index(aiter);
- offset = abd_iter_scatter_chunk_offset(aiter);
- aiter->iter_mapsize = zfs_abd_chunk_size - offset;
- paddr = aiter->iter_abd->abd_u.abd_scatter.abd_chunks[index];
- }
- aiter->iter_mapaddr = (char *)paddr + offset;
-}
-
-/*
- * Unmap the current chunk from aiter. This can be safely called when the aiter
- * has already been exhausted, in which case this does nothing.
- */
-static void
-abd_iter_unmap(struct abd_iter *aiter)
-{
- /* There's nothing left to unmap, so do nothing */
- if (aiter->iter_pos == aiter->iter_abd->abd_size)
- return;
-
- ASSERT3P(aiter->iter_mapaddr, !=, NULL);
- ASSERT3U(aiter->iter_mapsize, >, 0);
-
- aiter->iter_mapaddr = NULL;
- aiter->iter_mapsize = 0;
-}
-
-int
-abd_iterate_func(abd_t *abd, size_t off, size_t size,
- abd_iter_func_t *func, void *private)
-{
- int ret = 0;
- struct abd_iter aiter;
-
- abd_verify(abd);
- ASSERT3U(off + size, <=, abd->abd_size);
-
- abd_iter_init(&aiter, abd);
- abd_iter_advance(&aiter, off);
-
- while (size > 0) {
- abd_iter_map(&aiter);
-
- size_t len = MIN(aiter.iter_mapsize, size);
- ASSERT3U(len, >, 0);
-
- ret = func(aiter.iter_mapaddr, len, private);
-
- abd_iter_unmap(&aiter);
-
- if (ret != 0)
- break;
-
- size -= len;
- abd_iter_advance(&aiter, len);
- }
-
- return (ret);
-}
-
-struct buf_arg {
- void *arg_buf;
-};
-
-static int
-abd_copy_to_buf_off_cb(void *buf, size_t size, void *private)
-{
- struct buf_arg *ba_ptr = private;
-
- (void) memcpy(ba_ptr->arg_buf, buf, size);
- ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size;
-
- return (0);
-}
-
-/*
- * Copy abd to buf. (off is the offset in abd.)
- */
-void
-abd_copy_to_buf_off(void *buf, abd_t *abd, size_t off, size_t size)
-{
- struct buf_arg ba_ptr = { buf };
-
- (void) abd_iterate_func(abd, off, size, abd_copy_to_buf_off_cb,
- &ba_ptr);
-}
-
-static int
-abd_cmp_buf_off_cb(void *buf, size_t size, void *private)
-{
- int ret;
- struct buf_arg *ba_ptr = private;
-
- ret = memcmp(buf, ba_ptr->arg_buf, size);
- ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size;
-
- return (ret);
-}
-
-/*
- * Compare the contents of abd to buf. (off is the offset in abd.)
- */
-int
-abd_cmp_buf_off(abd_t *abd, const void *buf, size_t off, size_t size)
-{
- struct buf_arg ba_ptr = { (void *) buf };
-
- return (abd_iterate_func(abd, off, size, abd_cmp_buf_off_cb, &ba_ptr));
-}
-
-static int
-abd_copy_from_buf_off_cb(void *buf, size_t size, void *private)
-{
- struct buf_arg *ba_ptr = private;
-
- (void) memcpy(buf, ba_ptr->arg_buf, size);
- ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size;
-
- return (0);
-}
-
-/*
- * Copy from buf to abd. (off is the offset in abd.)
- */
-void
-abd_copy_from_buf_off(abd_t *abd, const void *buf, size_t off, size_t size)
-{
- struct buf_arg ba_ptr = { (void *) buf };
-
- (void) abd_iterate_func(abd, off, size, abd_copy_from_buf_off_cb,
- &ba_ptr);
-}
-
-/*ARGSUSED*/
-static int
-abd_zero_off_cb(void *buf, size_t size, void *private)
-{
- (void) memset(buf, 0, size);
- return (0);
-}
-
-/*
- * Zero out the abd from a particular offset to the end.
- */
-void
-abd_zero_off(abd_t *abd, size_t off, size_t size)
-{
- (void) abd_iterate_func(abd, off, size, abd_zero_off_cb, NULL);
-}
-
-/*
- * Iterate over two ABDs and call func incrementally on the two ABDs' data in
- * equal-sized chunks (passed to func as raw buffers). func could be called many
- * times during this iteration.
- */
-int
-abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff,
- size_t size, abd_iter_func2_t *func, void *private)
-{
- int ret = 0;
- struct abd_iter daiter, saiter;
-
- abd_verify(dabd);
- abd_verify(sabd);
-
- ASSERT3U(doff + size, <=, dabd->abd_size);
- ASSERT3U(soff + size, <=, sabd->abd_size);
-
- abd_iter_init(&daiter, dabd);
- abd_iter_init(&saiter, sabd);
- abd_iter_advance(&daiter, doff);
- abd_iter_advance(&saiter, soff);
-
- while (size > 0) {
- abd_iter_map(&daiter);
- abd_iter_map(&saiter);
-
- size_t dlen = MIN(daiter.iter_mapsize, size);
- size_t slen = MIN(saiter.iter_mapsize, size);
- size_t len = MIN(dlen, slen);
- ASSERT(dlen > 0 || slen > 0);
-
- ret = func(daiter.iter_mapaddr, saiter.iter_mapaddr, len,
- private);
-
- abd_iter_unmap(&saiter);
- abd_iter_unmap(&daiter);
-
- if (ret != 0)
- break;
-
- size -= len;
- abd_iter_advance(&daiter, len);
- abd_iter_advance(&saiter, len);
- }
-
- return (ret);
-}
-
-/*ARGSUSED*/
-static int
-abd_copy_off_cb(void *dbuf, void *sbuf, size_t size, void *private)
-{
- (void) memcpy(dbuf, sbuf, size);
- return (0);
-}
-
-/*
- * Copy from sabd to dabd starting from soff and doff.
- */
-void
-abd_copy_off(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, size_t size)
-{
- (void) abd_iterate_func2(dabd, sabd, doff, soff, size,
- abd_copy_off_cb, NULL);
-}
-
-/*ARGSUSED*/
-static int
-abd_cmp_cb(void *bufa, void *bufb, size_t size, void *private)
-{
- return (memcmp(bufa, bufb, size));
-}
-
-/*
- * Compares the first size bytes of two ABDs.
- */
-int
-abd_cmp(abd_t *dabd, abd_t *sabd, size_t size)
-{
- return (abd_iterate_func2(dabd, sabd, 0, 0, size, abd_cmp_cb, NULL));
-}
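
To make the ABD lifecycle described in the block comment at the top of abd.c concrete, here is a minimal consumer sketch (allocate, borrow a flat view, derive an offset child). It is not part of the commit; the buffer size and fill byte are illustrative:

#include <sys/abd.h>

static void
abd_example(void)
{
	/* May come back scattered; consumers need not care which. */
	abd_t *abd = abd_alloc(131072, B_FALSE);

	/* Borrow a linear buffer, copying existing contents if scattered. */
	char *buf = abd_borrow_buf_copy(abd, abd->abd_size);
	memset(buf, 0xab, abd->abd_size);
	/* Copy changes back (for scattered ABDs) and release the borrow. */
	abd_return_buf_copy(abd, buf, abd->abd_size);

	/* A child ABD sharing the parent's data from byte 512 onward. */
	abd_t *child = abd_get_offset(abd, 512);
	abd_put(child);		/* children are released before the parent */

	abd_free(abd);		/* frees the struct and underlying chunks */
}

The abd_borrow_buf_copy()/abd_return_buf_copy() pair hides the scattered representation from consumers that need a flat view, at the cost of one copy in each direction.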
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/aggsum.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/aggsum.c
deleted file mode 100644
index 713ff2b0116c..000000000000
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/aggsum.c
+++ /dev/null
@@ -1,234 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * This file and its contents are supplied under the terms of the
- * Common Development and Distribution License ("CDDL"), version 1.0.
- * You may only use this file in accordance with the terms of version
- * 1.0 of the CDDL.
- *
- * A full copy of the text of the CDDL should have accompanied this
- * source. A copy of the CDDL is also available via the Internet at
- * http://www.illumos.org/license/CDDL.
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2017, 2018 by Delphix. All rights reserved.
- */
-
-#include <sys/zfs_context.h>
-#include <sys/aggsum.h>
-
-/*
- * Aggregate-sum counters are a form of fanned-out counter, used when atomic
- * instructions on a single field cause enough CPU cache line contention to
- * slow system performance. Due to their increased overhead and the expense
- * involved with precisely reading from them, they should only be used in cases
- * where the write rate (increment/decrement) is much higher than the read rate
- * (get value).
- *
- * Aggregate sum counters are comprised of two basic parts, the core and the
- * buckets. The core counter contains a lock for the entire counter, as well
- * as the current upper and lower bounds on the value of the counter. The
- * aggsum_bucket structure contains a per-bucket lock to protect the contents of
- * the bucket, the current amount that this bucket has changed from the global
- * counter (called the delta), and the amount of increment and decrement we have
- * "borrowed" from the core counter.
- *
- * The basic operation of an aggsum is simple. Threads that wish to modify the
- * counter will modify one bucket's counter (determined by their current CPU, to
- * help minimize lock and cache contention). If the bucket already has
- * sufficient capacity borrowed from the core structure to handle their request,
- * they simply modify the delta and return. If the bucket does not, we clear
- * the bucket's current state (to prevent the borrowed amounts from getting too
- * large), and borrow more from the core counter. Borrowing is done by adding to
- * the upper bound (or subtracting from the lower bound) of the core counter,
- * and setting the borrow value for the bucket to the amount added (or
- * subtracted). Clearing the bucket is the opposite; we add the current delta
- * to both the lower and upper bounds of the core counter, subtract the borrowed
- * increment from the upper bound, and add the borrowed decrement to the
- * lower bound. Note that only borrowing and clearing require access to the
- * core counter; since all other operations access CPU-local resources,
- * performance can be much higher than a traditional counter.
- *
- * Threads that wish to read from the counter have a slightly more challenging
- * task. It is fast to determine the upper and lower bounds of the aggsum; this
- * does not require grabbing any locks. This suffices for cases where an
- * approximation of the aggsum's value is acceptable. However, if one needs to
- * know whether some specific value is above or below the current value in the
- * aggsum, one invokes aggsum_compare(). This function operates by repeatedly
- * comparing the target value to the upper and lower bounds of the aggsum, and
- * then clearing a bucket. This proceeds until the target is outside of the
- * upper and lower bounds and we return a response, or the last bucket has been
- * cleared and we know that the target is equal to the aggsum's value. Finally,
- * the most expensive operation is determining the precise value of the aggsum.
- * To do this, we clear every bucket and then return the upper bound (which must
- * be equal to the lower bound). What makes aggsum_compare() and aggsum_value()
- * expensive is clearing buckets. This involves grabbing the global lock
- * (serializing against each other and against borrow operations), grabbing a bucket's
- * lock (preventing threads on those CPUs from modifying their delta), and
- * zeroing out the borrowed value (forcing that thread to borrow on its next
- * request, which will also be expensive). This is what makes aggsums well
- * suited for write-many read-rarely operations.
- */
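-
-/*
- * Illustrative sketch (added; not in the original source): typical use of
- * the API below. Variable names are hypothetical.
- */
-#if 0
- aggsum_t as;
-
- aggsum_init(&as, 0);   /* counter starts at zero */
- aggsum_add(&as, 123);   /* cheap: CPU-local bucket in the common case */
- aggsum_add(&as, -23);
- ASSERT0(aggsum_compare(&as, 100)); /* expensive: may flush buckets */
- VERIFY3U(aggsum_value(&as), ==, 100); /* most expensive: exact value */
- aggsum_fini(&as);
-#endif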
-
-/*
- * We will borrow aggsum_borrow_multiplier times the current request, so we will
- * have to get the as_lock approximately every aggsum_borrow_multiplier calls to
- * aggsum_delta().
- */
-static uint_t aggsum_borrow_multiplier = 10;
-
-void
-aggsum_init(aggsum_t *as, uint64_t value)
-{
- bzero(as, sizeof (*as));
- as->as_lower_bound = as->as_upper_bound = value;
- mutex_init(&as->as_lock, NULL, MUTEX_DEFAULT, NULL);
- as->as_numbuckets = boot_ncpus;
- as->as_buckets = kmem_zalloc(boot_ncpus * sizeof (aggsum_bucket_t),
- KM_SLEEP);
- for (int i = 0; i < as->as_numbuckets; i++) {
- mutex_init(&as->as_buckets[i].asc_lock,
- NULL, MUTEX_DEFAULT, NULL);
- }
-}
-
-void
-aggsum_fini(aggsum_t *as)
-{
- for (int i = 0; i < as->as_numbuckets; i++)
- mutex_destroy(&as->as_buckets[i].asc_lock);
- kmem_free(as->as_buckets, as->as_numbuckets * sizeof (aggsum_bucket_t));
- mutex_destroy(&as->as_lock);
-}
-
-int64_t
-aggsum_lower_bound(aggsum_t *as)
-{
- return (as->as_lower_bound);
-}
-
-int64_t
-aggsum_upper_bound(aggsum_t *as)
-{
- return (as->as_upper_bound);
-}
-
-static void
-aggsum_flush_bucket(aggsum_t *as, struct aggsum_bucket *asb)
-{
- ASSERT(MUTEX_HELD(&as->as_lock));
- ASSERT(MUTEX_HELD(&asb->asc_lock));
-
- /*
- * We use atomic instructions for this because we read the upper and
- * lower bounds without the lock, so we need stores to be atomic.
- */
- atomic_add_64((volatile uint64_t *)&as->as_lower_bound,
- asb->asc_delta + asb->asc_borrowed);
- atomic_add_64((volatile uint64_t *)&as->as_upper_bound,
- asb->asc_delta - asb->asc_borrowed);
- asb->asc_delta = 0;
- asb->asc_borrowed = 0;
-}
-
-uint64_t
-aggsum_value(aggsum_t *as)
-{
- int64_t rv;
-
- mutex_enter(&as->as_lock);
- if (as->as_lower_bound == as->as_upper_bound) {
- rv = as->as_lower_bound;
- for (int i = 0; i < as->as_numbuckets; i++) {
- ASSERT0(as->as_buckets[i].asc_delta);
- ASSERT0(as->as_buckets[i].asc_borrowed);
- }
- mutex_exit(&as->as_lock);
- return (rv);
- }
- for (int i = 0; i < as->as_numbuckets; i++) {
- struct aggsum_bucket *asb = &as->as_buckets[i];
- mutex_enter(&asb->asc_lock);
- aggsum_flush_bucket(as, asb);
- mutex_exit(&asb->asc_lock);
- }
- VERIFY3U(as->as_lower_bound, ==, as->as_upper_bound);
- rv = as->as_lower_bound;
- mutex_exit(&as->as_lock);
-
- return (rv);
-}
-
-void
-aggsum_add(aggsum_t *as, int64_t delta)
-{
- struct aggsum_bucket *asb =
- &as->as_buckets[CPU_SEQID % as->as_numbuckets];
- int64_t borrow;
-
- /* Try fast path if we already borrowed enough before. */
- mutex_enter(&asb->asc_lock);
- if (asb->asc_delta + delta <= (int64_t)asb->asc_borrowed &&
- asb->asc_delta + delta >= -(int64_t)asb->asc_borrowed) {
- asb->asc_delta += delta;
- mutex_exit(&asb->asc_lock);
- return;
- }
- mutex_exit(&asb->asc_lock);
-
- /*
- * We haven't borrowed enough. Take the global lock and borrow
- * considering what is requested now and what we borrowed before.
- */
- borrow = (delta < 0 ? -delta : delta) * aggsum_borrow_multiplier;
- mutex_enter(&as->as_lock);
- mutex_enter(&asb->asc_lock);
- delta += asb->asc_delta;
- asb->asc_delta = 0;
- if (borrow >= asb->asc_borrowed)
- borrow -= asb->asc_borrowed;
- else
- borrow = (borrow - (int64_t)asb->asc_borrowed) / 4;
- asb->asc_borrowed += borrow;
- atomic_add_64((volatile uint64_t *)&as->as_lower_bound,
- delta - borrow);
- atomic_add_64((volatile uint64_t *)&as->as_upper_bound,
- delta + borrow);
- mutex_exit(&asb->asc_lock);
- mutex_exit(&as->as_lock);
-}
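-
-/*
- * Worked example (added for illustration): with aggsum_borrow_multiplier
- * of 10, a first aggsum_add(as, 5) against a fresh bucket takes the slow
- * path and borrows 50: as_lower_bound += 5 - 50, as_upper_bound += 5 + 50,
- * and asc_borrowed becomes 50. Subsequent deltas that keep |asc_delta|
- * within 50 then take the fast path, touching only the bucket lock.
- */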
-
-/*
- * Compare the aggsum value to target efficiently. Returns -1 if the value
- * represented by the aggsum is less than target, 1 if it's greater, and 0 if
- * they are equal.
- */
-int
-aggsum_compare(aggsum_t *as, uint64_t target)
-{
- if (as->as_upper_bound < target)
- return (-1);
- if (as->as_lower_bound > target)
- return (1);
- mutex_enter(&as->as_lock);
- for (int i = 0; i < as->as_numbuckets; i++) {
- struct aggsum_bucket *asb = &as->as_buckets[i];
- mutex_enter(&asb->asc_lock);
- aggsum_flush_bucket(as, asb);
- mutex_exit(&asb->asc_lock);
- if (as->as_upper_bound < target) {
- mutex_exit(&as->as_lock);
- return (-1);
- }
- if (as->as_lower_bound > target) {
- mutex_exit(&as->as_lock);
- return (1);
- }
- }
- VERIFY3U(as->as_lower_bound, ==, as->as_upper_bound);
- ASSERT3U(as->as_lower_bound, ==, target);
- mutex_exit(&as->as_lock);
- return (0);
-}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
deleted file mode 100644
index 592fb02cfac1..000000000000
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
+++ /dev/null
@@ -1,8569 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2018, Joyent, Inc.
- * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
- * Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
- * Copyright 2017 Nexenta Systems, Inc. All rights reserved.
- */
-
-/*
- * DVA-based Adjustable Replacement Cache
- *
- * While much of the theory of operation used here is
- * based on the self-tuning, low overhead replacement cache
- * presented by Megiddo and Modha at FAST 2003, there are some
- * significant differences:
- *
- * 1. The Megiddo and Modha model assumes any page is evictable.
- * Pages in its cache cannot be "locked" into memory. This makes
- * the eviction algorithm simple: evict the last page in the list.
- * This also makes the performance characteristics easy to reason
- * about. Our cache is not so simple. At any given moment, some
- * subset of the blocks in the cache are un-evictable because we
- * have handed out a reference to them. Blocks are only evictable
- * when there are no external references active. This makes
- * eviction far more problematic: we choose to evict the evictable
- * blocks that are the "lowest" in the list.
- *
- * There are times when it is not possible to evict the requested
- * space. In these circumstances we are unable to adjust the cache
- * size. To prevent the cache growing unbounded at these times we
- * implement a "cache throttle" that slows the flow of new data
- * into the cache until we can make space available.
- *
- * 2. The Megiddo and Modha model assumes a fixed cache size.
- * Pages are evicted when the cache is full and there is a cache
- * miss. Our model has a variable sized cache. It grows with
- * high use, but also tries to react to memory pressure from the
- * operating system: decreasing its size when system memory is
- * tight.
- *
- * 3. The Megiddo and Modha model assumes a fixed page size. All
- * elements of the cache are therefore exactly the same size. So
- * when adjusting the cache size following a cache miss, it's simply
- * a matter of choosing a single page to evict. In our model, we
- * have variable-sized cache blocks (ranging from 512 bytes to
- * 128K bytes). We therefore choose a set of blocks to evict to make
- * space for a cache miss that approximates as closely as possible
- * the space used by the new block.
- *
- * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache"
- * by N. Megiddo & D. Modha, FAST 2003
- */
-
-/*
- * The locking model:
- *
- * A new reference to a cache buffer can be obtained in two
- * ways: 1) via a hash table lookup using the DVA as a key,
- * or 2) via one of the ARC lists. The arc_read() interface
- * uses method 1, while the internal ARC algorithms for
- * adjusting the cache use method 2. We therefore provide two
- * types of locks: 1) the hash table lock array, and 2) the
- * ARC list locks.
- *
- * Buffers do not have their own mutexes, rather they rely on the
- * hash table mutexes for the bulk of their protection (i.e. most
- * fields in the arc_buf_hdr_t are protected by these mutexes).
- *
- * buf_hash_find() returns the appropriate mutex (held) when it
- * locates the requested buffer in the hash table. It returns
- * NULL for the mutex if the buffer was not in the table.
- *
- * buf_hash_remove() expects the appropriate hash mutex to be
- * already held before it is invoked.
- *
- * Each ARC state also has a mutex which is used to protect the
- * buffer list associated with the state. When attempting to
- * obtain a hash table lock while holding an ARC list lock you
- * must use mutex_tryenter() to avoid deadlock. Also note that
- * the active state mutex must be held before the ghost state mutex.
- *
- * It is also possible to register a callback which is run when the
- * arc_meta_limit is reached and no buffers can be safely evicted. In
- * this case the arc user should drop a reference on some arc buffers so
- * they can be reclaimed and the arc_meta_limit honored. For example,
- * when using the ZPL each dentry holds a reference on a znode. These
- * dentries must be pruned before the arc buffer holding the znode can
- * be safely evicted.
- *
- * Note that the majority of the performance stats are manipulated
- * with atomic operations.
- *
- * The L2ARC uses the l2ad_mtx on each vdev for the following:
- *
- * - L2ARC buflist creation
- * - L2ARC buflist eviction
- * - L2ARC write completion, which walks L2ARC buflists
- * - ARC header destruction, as it removes from L2ARC buflists
- * - ARC header release, as it removes from L2ARC buflists
- */
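-
-/*
- * Illustrative sketch (added; not in the original file) of the ordering
- * rule above: with an ARC list (state) lock held, e.g. inside the
- * eviction loop, the hash lock may only be taken with mutex_tryenter().
- */
-#if 0
- kmutex_t *hash_lock = HDR_LOCK(hdr);
-
- if (!mutex_tryenter(hash_lock)) {
-  /* Taking it outright could deadlock; skip this header. */
-  ARCSTAT_BUMP(arcstat_mutex_miss);
-  continue;
- }
- /* ... evict under both locks ... */
- mutex_exit(hash_lock);
-#endif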
-
-/*
- * ARC operation:
- *
- * Every block that is in the ARC is tracked by an arc_buf_hdr_t structure.
- * This structure can point either to a block that is still in the cache or to
- * one that is only accessible in an L2 ARC device, or it can provide
- * information about a block that was recently evicted. If a block is
- * only accessible in the L2ARC, then the arc_buf_hdr_t only has enough
- * information to retrieve it from the L2ARC device. This information is
- * stored in the l2arc_buf_hdr_t sub-structure of the arc_buf_hdr_t. A block
- * that is in this state cannot access the data directly.
- *
- * Blocks that are actively being referenced or have not been evicted
- * are cached in the L1ARC. The L1ARC (l1arc_buf_hdr_t) is a structure within
- * the arc_buf_hdr_t that will point to the data block in memory. A block can
- * only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC
- * caches data in two ways -- in a list of ARC buffers (arc_buf_t) and
- * also in the arc_buf_hdr_t's private physical data block pointer (b_pabd).
- *
- * The L1ARC's data pointer may or may not be uncompressed. The ARC has the
- * ability to store the physical data (b_pabd) associated with the DVA of the
- * arc_buf_hdr_t. Since the b_pabd is a copy of the on-disk physical block,
- * it will match its on-disk compression characteristics. This behavior can be
- * disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the
- * compressed ARC functionality is disabled, the b_pabd will point to an
- * uncompressed version of the on-disk data.
- *
- * Data in the L1ARC is not accessed by consumers of the ARC directly. Each
- * arc_buf_hdr_t can have multiple ARC buffers (arc_buf_t) which reference it.
- * Each ARC buffer (arc_buf_t) is being actively accessed by a specific ARC
- * consumer. The ARC will provide references to this data and will keep it
- * cached until it is no longer in use. The ARC caches only the L1ARC's physical
- * data block and will evict any arc_buf_t that is no longer referenced. The
- * amount of memory consumed by the arc_buf_ts' data buffers can be seen via the
- * "overhead_size" kstat.
- *
- * Depending on the consumer, an arc_buf_t can be requested in uncompressed or
- * compressed form. The typical case is that consumers will want uncompressed
- * data, and when that happens a new data buffer is allocated where the data is
- * decompressed for them to use. Currently the only consumer who wants
- * compressed arc_buf_t's is "zfs send", when it streams data exactly as it
- * exists on disk. When this happens, the arc_buf_t's data buffer is shared
- * with the arc_buf_hdr_t.
- *
- * Here is a diagram showing an arc_buf_hdr_t referenced by two arc_buf_t's. The
- * first one is owned by a compressed send consumer (and therefore references
- * the same compressed data buffer as the arc_buf_hdr_t) and the second could be
- * used by any other consumer (and has its own uncompressed copy of the data
- * buffer).
- *
- * arc_buf_hdr_t
- * +-----------+
- * | fields |
- * | common to |
- * | L1- and |
- * | L2ARC |
- * +-----------+
- * | l2arc_buf_hdr_t
- * | |
- * +-----------+
- * | l1arc_buf_hdr_t
- * | | arc_buf_t
- * | b_buf +------------>+-----------+ arc_buf_t
- * | b_pabd +-+ |b_next +---->+-----------+
- * +-----------+ | |-----------| |b_next +-->NULL
- * | |b_comp = T | +-----------+
- * | |b_data +-+ |b_comp = F |
- * | +-----------+ | |b_data +-+
- * +->+------+ | +-----------+ |
- * compressed | | | |
- * data | |<--------------+ | uncompressed
- * +------+ compressed, | data
- * shared +-->+------+
- * data | |
- * | |
- * +------+
- *
- * When a consumer reads a block, the ARC must first look to see if the
- * arc_buf_hdr_t is cached. If the hdr is cached then the ARC allocates a new
- * arc_buf_t and either copies uncompressed data into a new data buffer from an
- * existing uncompressed arc_buf_t, decompresses the hdr's b_pabd buffer into a
- * new data buffer, or shares the hdr's b_pabd buffer, depending on whether the
- * hdr is compressed and the desired compression characteristics of the
- * arc_buf_t consumer. If the arc_buf_t ends up sharing data with the
- * arc_buf_hdr_t and both of them are uncompressed then the arc_buf_t must be
- * the last buffer in the hdr's b_buf list, however a shared compressed buf can
- * be anywhere in the hdr's list.
- *
- * The diagram below shows an example of an uncompressed ARC hdr that is
- * sharing its data with an arc_buf_t (note that the shared uncompressed buf is
- * the last element in the buf list):
- *
- * arc_buf_hdr_t
- * +-----------+
- * | |
- * | |
- * | |
- * +-----------+
- * l2arc_buf_hdr_t| |
- * | |
- * +-----------+
- * l1arc_buf_hdr_t| |
- * | | arc_buf_t (shared)
- * | b_buf +------------>+---------+ arc_buf_t
- * | | |b_next +---->+---------+
- * | b_pabd +-+ |---------| |b_next +-->NULL
- * +-----------+ | | | +---------+
- * | |b_data +-+ | |
- * | +---------+ | |b_data +-+
- * +->+------+ | +---------+ |
- * | | | |
- * uncompressed | | | |
- * data +------+ | |
- * ^ +->+------+ |
- * | uncompressed | | |
- * | data | | |
- * | +------+ |
- * +---------------------------------+
- *
- * Writing to the ARC requires that the ARC first discard the hdr's b_pabd
- * since the physical block is about to be rewritten. The new data contents
- * will be contained in the arc_buf_t. As the I/O pipeline performs the write,
- * it may compress the data before writing it to disk. The ARC will be called
- * with the transformed data and will bcopy the transformed on-disk block into
- * a newly allocated b_pabd. Writes are always done into buffers which have
- * either been loaned (and hence are new and don't have other readers) or
- * buffers which have been released (and hence have their own hdr, if there
- * were originally other readers of the buf's original hdr). This ensures that
- * the ARC only needs to update a single buf and its hdr after a write occurs.
- *
- * When the L2ARC is in use, it will also take advantage of the b_pabd. The
- * L2ARC will always write the contents of b_pabd to the L2ARC. This means
- * that when compressed ARC is enabled, the L2ARC blocks are identical
- * to the on-disk block in the main data pool. This provides a significant
- * advantage since the ARC can leverage the bp's checksum when reading from the
- * L2ARC to determine if the contents are valid. However, if the compressed
- * ARC is disabled, then the L2ARC's block must be transformed to look
- * like the physical block in the main data pool before comparing the
- * checksum and determining its validity.
- */
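-
-/*
- * Simplified sketch (added; not original code) of the per-consumer
- * decision described above. The real logic lives in the buf-allocation
- * path of arc_read(); the predicate and helper names here are
- * hypothetical.
- */
-#if 0
- if (want_compressed && HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) {
-  share_hdr_pabd(buf, hdr); /* "zfs send": share compressed b_pabd */
- } else if (HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF && can_share_last) {
-  share_hdr_pabd(buf, hdr); /* share uncompressed b_pabd (last buf only) */
- } else {
-  alloc_and_fill(buf, hdr); /* private copy, decompressing if needed */
- }
-#endif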
-
-#include <sys/spa.h>
-#include <sys/zio.h>
-#include <sys/spa_impl.h>
-#include <sys/zio_compress.h>
-#include <sys/zio_checksum.h>
-#include <sys/zfs_context.h>
-#include <sys/arc.h>
-#include <sys/refcount.h>
-#include <sys/vdev.h>
-#include <sys/vdev_impl.h>
-#include <sys/dsl_pool.h>
-#include <sys/zio_checksum.h>
-#include <sys/multilist.h>
-#include <sys/abd.h>
-#ifdef _KERNEL
-#include <sys/dnlc.h>
-#include <sys/racct.h>
-#endif
-#include <sys/callb.h>
-#include <sys/kstat.h>
-#include <sys/trim_map.h>
-#include <sys/zthr.h>
-#include <zfs_fletcher.h>
-#include <sys/sdt.h>
-#include <sys/aggsum.h>
-#include <sys/cityhash.h>
-
-#include <machine/vmparam.h>
-
-#ifdef illumos
-#ifndef _KERNEL
-/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
-boolean_t arc_watch = B_FALSE;
-int arc_procfd;
-#endif
-#endif /* illumos */
-
-/*
- * This thread's job is to keep enough free memory in the system, by
- * calling arc_kmem_reap_now() plus arc_shrink(), which improves
- * arc_available_memory().
- */
-static zthr_t *arc_reap_zthr;
-
-/*
- * This thread's job is to keep arc_size under arc_c, by calling
- * arc_adjust(), which improves arc_is_overflowing().
- */
-static zthr_t *arc_adjust_zthr;
-
-static kmutex_t arc_adjust_lock;
-static kcondvar_t arc_adjust_waiters_cv;
-static boolean_t arc_adjust_needed = B_FALSE;
-
-static kmutex_t arc_dnlc_evicts_lock;
-static kcondvar_t arc_dnlc_evicts_cv;
-static boolean_t arc_dnlc_evicts_thread_exit;
-
-uint_t arc_reduce_dnlc_percent = 3;
-
-/*
- * The number of headers to evict in arc_evict_state_impl() before
- * dropping the sublist lock and evicting from another sublist. A lower
- * value means we're more likely to evict the "correct" header (i.e. the
- * oldest header in the arc state), but comes with higher overhead
- * (i.e. more invocations of arc_evict_state_impl()).
- */
-int zfs_arc_evict_batch_limit = 10;
-
-/* number of seconds before growing cache again */
-int arc_grow_retry = 60;
-
-/*
- * Minimum time between calls to arc_kmem_reap_soon(). Note that this will
- * be converted to ticks, so with the default hz=100, a setting of 15 ms
- * will actually wait 2 ticks, or 20ms.
- */
-int arc_kmem_cache_reap_retry_ms = 1000;
-
-/* shift of arc_c for calculating overflow limit in arc_get_data_impl */
-int zfs_arc_overflow_shift = 8;
-
-/* shift of arc_c for calculating both min and max arc_p */
-int arc_p_min_shift = 4;
-
-/* log2(fraction of arc to reclaim) */
-int arc_shrink_shift = 7;
-
-/*
- * log2(fraction of ARC which must be free to allow growing).
- * I.e., if there is less than arc_c >> arc_no_grow_shift free memory,
- * when reading a new block into the ARC, we will evict an equal-sized block
- * from the ARC.
- *
- * This must be less than arc_shrink_shift, so that when we shrink the ARC,
- * we will still not allow it to grow.
- */
-int arc_no_grow_shift = 5;
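-
-/*
- * Example (added for illustration): with arc_c of 4 GiB and the defaults
- * above, the ARC may grow only while more than 4 GiB >> 5 = 128 MiB of
- * memory is free, and one arc_shrink() step targets 4 GiB >> 7 = 32 MiB.
- */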
-
-
-/*
- * minimum lifespan of a prefetch block in clock ticks
- * (initialized in arc_init())
- */
-static int zfs_arc_min_prefetch_ms = 1;
-static int zfs_arc_min_prescient_prefetch_ms = 6;
-
-/*
- * If this percent of memory is free, don't throttle.
- */
-int arc_lotsfree_percent = 10;
-
-static boolean_t arc_initialized;
-extern boolean_t zfs_prefetch_disable;
-
-/*
- * The arc has filled available memory and has now warmed up.
- */
-static boolean_t arc_warm;
-
-/*
- * log2 fraction of the zio arena to keep free.
- */
-int arc_zio_arena_free_shift = 2;
-
-/*
- * These tunables are for performance analysis.
- */
-uint64_t zfs_arc_max;
-uint64_t zfs_arc_min;
-uint64_t zfs_arc_meta_limit = 0;
-uint64_t zfs_arc_meta_min = 0;
-uint64_t zfs_arc_dnode_limit = 0;
-uint64_t zfs_arc_dnode_reduce_percent = 10;
-int zfs_arc_grow_retry = 0;
-int zfs_arc_shrink_shift = 0;
-int zfs_arc_no_grow_shift = 0;
-int zfs_arc_p_min_shift = 0;
-uint64_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
-u_int zfs_arc_free_target = 0;
-
-/* Absolute min for arc min / max is 16MB. */
-static uint64_t arc_abs_min = 16 << 20;
-
-/*
- * ARC dirty data constraints for arc_tempreserve_space() throttle
- */
-uint_t zfs_arc_dirty_limit_percent = 50; /* total dirty data limit */
-uint_t zfs_arc_anon_limit_percent = 25; /* anon block dirty limit */
-uint_t zfs_arc_pool_dirty_percent = 20; /* each pool's anon allowance */
-
-boolean_t zfs_compressed_arc_enabled = B_TRUE;
-
-static int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS);
-static int sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS);
-static int sysctl_vfs_zfs_arc_max(SYSCTL_HANDLER_ARGS);
-static int sysctl_vfs_zfs_arc_min(SYSCTL_HANDLER_ARGS);
-static int sysctl_vfs_zfs_arc_no_grow_shift(SYSCTL_HANDLER_ARGS);
-
-#if defined(__FreeBSD__) && defined(_KERNEL)
-static void
-arc_free_target_init(void *unused __unused)
-{
-
- zfs_arc_free_target = vm_cnt.v_free_target;
-}
-SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY,
- arc_free_target_init, NULL);
-
-TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit);
-TUNABLE_QUAD("vfs.zfs.arc_meta_min", &zfs_arc_meta_min);
-TUNABLE_INT("vfs.zfs.arc_shrink_shift", &zfs_arc_shrink_shift);
-TUNABLE_INT("vfs.zfs.arc_grow_retry", &zfs_arc_grow_retry);
-TUNABLE_INT("vfs.zfs.arc_no_grow_shift", &zfs_arc_no_grow_shift);
-SYSCTL_DECL(_vfs_zfs);
-SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_max,
- CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RWTUN,
- 0, sizeof(uint64_t), sysctl_vfs_zfs_arc_max, "QU", "Maximum ARC size");
-SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_min,
- CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RWTUN,
- 0, sizeof(uint64_t), sysctl_vfs_zfs_arc_min, "QU", "Minimum ARC size");
-SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_no_grow_shift,
- CTLTYPE_U32 | CTLFLAG_MPSAFE | CTLFLAG_RWTUN,
- 0, sizeof(uint32_t), sysctl_vfs_zfs_arc_no_grow_shift, "U",
- "log2(fraction of ARC which must be free to allow growing)");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_average_blocksize, CTLFLAG_RDTUN,
- &zfs_arc_average_blocksize, 0,
- "ARC average blocksize");
-SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_shrink_shift, CTLFLAG_RW,
- &arc_shrink_shift, 0,
- "log2(fraction of arc to reclaim)");
-SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_grow_retry, CTLFLAG_RW,
- &arc_grow_retry, 0,
- "Wait in seconds before considering growing ARC");
-SYSCTL_INT(_vfs_zfs, OID_AUTO, compressed_arc_enabled, CTLFLAG_RDTUN,
- &zfs_compressed_arc_enabled, 0,
- "Enable compressed ARC");
-SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_kmem_cache_reap_retry_ms, CTLFLAG_RWTUN,
- &arc_kmem_cache_reap_retry_ms, 0,
- "Interval between ARC kmem_cache reapings");
-
-/*
- * We don't have a tunable for arc_free_target due to the dependency on
- * pagedaemon initialisation.
- */
-SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target,
- CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(u_int),
- sysctl_vfs_zfs_arc_free_target, "IU",
- "Desired number of free pages below which ARC triggers reclaim");
-
-static int
-sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS)
-{
- u_int val;
- int err;
-
- val = zfs_arc_free_target;
- err = sysctl_handle_int(oidp, &val, 0, req);
- if (err != 0 || req->newptr == NULL)
- return (err);
-
- if (val < minfree)
- return (EINVAL);
- if (val > vm_cnt.v_page_count)
- return (EINVAL);
-
- zfs_arc_free_target = val;
-
- return (0);
-}
-
-/*
- * Must be declared here, before the definition of the corresponding kstat
- * macro; that macro uses the same names and would otherwise confuse the compiler.
- */
-SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_meta_limit,
- CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),
- sysctl_vfs_zfs_arc_meta_limit, "QU",
- "ARC metadata limit");
-#endif
-
-/*
- * Note that buffers can be in one of 6 states:
- * ARC_anon - anonymous (discussed below)
- * ARC_mru - recently used, currently cached
- * ARC_mru_ghost - recently used, no longer in cache
- * ARC_mfu - frequently used, currently cached
- * ARC_mfu_ghost - frequently used, no longer in cache
- * ARC_l2c_only - exists in L2ARC but not other states
- * When there are no active references to the buffer, they are
- * linked onto a list in one of these arc states. These are
- * the only buffers that can be evicted or deleted. Within each
- * state there are multiple lists, one for meta-data and one for
- * non-meta-data. Meta-data (indirect blocks, blocks of dnodes,
- * etc.) is tracked separately so that it can be managed more
- * deliberately: favored over data, and limited explicitly.
- *
- * Anonymous buffers are buffers that are not associated with
- * a DVA. These are buffers that hold dirty block copies
- * before they are written to stable storage. By definition,
- * they are "ref'd" and are considered part of arc_mru
- * that cannot be freed. Generally, they will acquire a DVA
- * as they are written and migrate onto the arc_mru list.
- *
- * The ARC_l2c_only state is for buffers that are in the second
- * level ARC but no longer in any of the ARC_m* lists. The second
- * level ARC itself may also contain buffers that are in any of
- * the ARC_m* states - meaning that a buffer can exist in two
- * places. The reason for the ARC_l2c_only state is to keep the
- * buffer header in the hash table, so that reads that hit the
- * second level ARC benefit from these fast lookups.
- */
-
-typedef struct arc_state {
- /*
- * list of evictable buffers
- */
- multilist_t *arcs_list[ARC_BUFC_NUMTYPES];
- /*
- * total amount of evictable data in this state
- */
- zfs_refcount_t arcs_esize[ARC_BUFC_NUMTYPES];
- /*
- * total amount of data in this state; this includes: evictable,
- * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA.
- */
- zfs_refcount_t arcs_size;
- /*
- * supports the "dbufs" kstat
- */
- arc_state_type_t arcs_state;
-} arc_state_t;
-
-/*
- * Percentage that can be consumed by dnodes of ARC meta buffers.
- */
-int zfs_arc_meta_prune = 10000;
-unsigned long zfs_arc_dnode_limit_percent = 10;
-int zfs_arc_meta_strategy = ARC_STRATEGY_META_ONLY;
-int zfs_arc_meta_adjust_restarts = 4096;
-
-SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_meta_strategy, CTLFLAG_RWTUN,
- &zfs_arc_meta_strategy, 0,
- "ARC metadata reclamation strategy "
- "(0 = metadata only, 1 = balance data and metadata)");
-
-/* The 6 states: */
-static arc_state_t ARC_anon;
-static arc_state_t ARC_mru;
-static arc_state_t ARC_mru_ghost;
-static arc_state_t ARC_mfu;
-static arc_state_t ARC_mfu_ghost;
-static arc_state_t ARC_l2c_only;
-
-typedef struct arc_stats {
- kstat_named_t arcstat_hits;
- kstat_named_t arcstat_misses;
- kstat_named_t arcstat_demand_data_hits;
- kstat_named_t arcstat_demand_data_misses;
- kstat_named_t arcstat_demand_metadata_hits;
- kstat_named_t arcstat_demand_metadata_misses;
- kstat_named_t arcstat_prefetch_data_hits;
- kstat_named_t arcstat_prefetch_data_misses;
- kstat_named_t arcstat_prefetch_metadata_hits;
- kstat_named_t arcstat_prefetch_metadata_misses;
- kstat_named_t arcstat_mru_hits;
- kstat_named_t arcstat_mru_ghost_hits;
- kstat_named_t arcstat_mfu_hits;
- kstat_named_t arcstat_mfu_ghost_hits;
- kstat_named_t arcstat_allocated;
- kstat_named_t arcstat_deleted;
- /*
- * Number of buffers that could not be evicted because the hash lock
- * was held by another thread. The lock may not necessarily be held
- * by something using the same buffer, since hash locks are shared
- * by multiple buffers.
- */
- kstat_named_t arcstat_mutex_miss;
- /*
- * Number of buffers skipped when updating the access state due to the
- * header having already been released after acquiring the hash lock.
- */
- kstat_named_t arcstat_access_skip;
- /*
- * Number of buffers skipped because they have I/O in progress, are
- * indirect prefetch buffers that have not lived long enough, or are
- * not from the spa we're trying to evict from.
- */
- kstat_named_t arcstat_evict_skip;
- /*
- * Number of times arc_evict_state() was unable to evict enough
- * buffers to reach its target amount.
- */
- kstat_named_t arcstat_evict_not_enough;
- kstat_named_t arcstat_evict_l2_cached;
- kstat_named_t arcstat_evict_l2_eligible;
- kstat_named_t arcstat_evict_l2_ineligible;
- kstat_named_t arcstat_evict_l2_skip;
- kstat_named_t arcstat_hash_elements;
- kstat_named_t arcstat_hash_elements_max;
- kstat_named_t arcstat_hash_collisions;
- kstat_named_t arcstat_hash_chains;
- kstat_named_t arcstat_hash_chain_max;
- kstat_named_t arcstat_p;
- kstat_named_t arcstat_c;
- kstat_named_t arcstat_c_min;
- kstat_named_t arcstat_c_max;
- /* Not updated directly; only synced in arc_kstat_update. */
- kstat_named_t arcstat_size;
- /*
- * Number of compressed bytes stored in the arc_buf_hdr_t's b_pabd.
- * Note that the compressed bytes may match the uncompressed bytes
- * if the block is either not compressed or compressed arc is disabled.
- */
- kstat_named_t arcstat_compressed_size;
- /*
- * Uncompressed size of the data stored in b_pabd. If compressed
- * arc is disabled then this value will be identical to the stat
- * above.
- */
- kstat_named_t arcstat_uncompressed_size;
- /*
- * Number of bytes stored in all the arc_buf_t's. This is classified
- * as "overhead" since this data is typically short-lived and will
- * be evicted from the arc when it becomes unreferenced unless the
- * zfs_keep_uncompressed_metadata or zfs_keep_uncompressed_level
- * values have been set (see comment in dbuf.c for more information).
- */
- kstat_named_t arcstat_overhead_size;
- /*
- * Number of bytes consumed by internal ARC structures necessary
- * for tracking purposes; these structures are not actually
- * backed by ARC buffers. This includes arc_buf_hdr_t structures
- * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only
- * caches), and arc_buf_t structures (allocated via arc_buf_t
- * cache).
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_hdr_size;
- /*
- * Number of bytes consumed by ARC buffers of type equal to
- * ARC_BUFC_DATA. This is generally consumed by buffers backing
- * on disk user data (e.g. plain file contents).
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_data_size;
- /*
- * Number of bytes consumed by ARC buffers of type equal to
- * ARC_BUFC_METADATA. This is generally consumed by buffers
- * backing on disk data that is used for internal ZFS
- * structures (e.g. ZAP, dnode, indirect blocks, etc).
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_metadata_size;
- /*
- * Number of bytes consumed by dmu_buf_impl_t objects.
- */
- kstat_named_t arcstat_dbuf_size;
- /*
- * Number of bytes consumed by dnode_t objects.
- */
- kstat_named_t arcstat_dnode_size;
- /*
- * Number of bytes consumed by bonus buffers.
- */
- kstat_named_t arcstat_bonus_size;
-#if defined(__FreeBSD__) && defined(COMPAT_FREEBSD11)
- /*
- * Sum of the previous three counters, provided for compatibility.
- */
- kstat_named_t arcstat_other_size;
-#endif
- /*
- * Total number of bytes consumed by ARC buffers residing in the
- * arc_anon state. This includes *all* buffers in the arc_anon
- * state; e.g. data, metadata, evictable, and unevictable buffers
- * are all included in this value.
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_anon_size;
- /*
- * Number of bytes consumed by ARC buffers that meet the
- * following criteria: backing buffers of type ARC_BUFC_DATA,
- * residing in the arc_anon state, and are eligible for eviction
- * (e.g. have no outstanding holds on the buffer).
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_anon_evictable_data;
- /*
- * Number of bytes consumed by ARC buffers that meet the
- * following criteria: backing buffers of type ARC_BUFC_METADATA,
- * residing in the arc_anon state, and are eligible for eviction
- * (e.g. have no outstanding holds on the buffer).
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_anon_evictable_metadata;
- /*
- * Total number of bytes consumed by ARC buffers residing in the
- * arc_mru state. This includes *all* buffers in the arc_mru
- * state; e.g. data, metadata, evictable, and unevictable buffers
- * are all included in this value.
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_mru_size;
- /*
- * Number of bytes consumed by ARC buffers that meet the
- * following criteria: backing buffers of type ARC_BUFC_DATA,
- * residing in the arc_mru state, and are eligible for eviction
- * (e.g. have no outstanding holds on the buffer).
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_mru_evictable_data;
- /*
- * Number of bytes consumed by ARC buffers that meet the
- * following criteria: backing buffers of type ARC_BUFC_METADATA,
- * residing in the arc_mru state, and are eligible for eviction
- * (e.g. have no outstanding holds on the buffer).
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_mru_evictable_metadata;
- /*
- * Total number of bytes that *would have been* consumed by ARC
- * buffers in the arc_mru_ghost state. The key thing to note
- * here is that this size doesn't actually indicate
- * RAM consumption. The ghost lists only consist of headers and
- * don't actually have ARC buffers linked off of these headers.
- * Thus, *if* the headers had associated ARC buffers, these
- * buffers *would have* consumed this number of bytes.
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_mru_ghost_size;
- /*
- * Number of bytes that *would have been* consumed by ARC
- * buffers that are eligible for eviction, of type
- * ARC_BUFC_DATA, and linked off the arc_mru_ghost state.
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_mru_ghost_evictable_data;
- /*
- * Number of bytes that *would have been* consumed by ARC
- * buffers that are eligible for eviction, of type
- * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state.
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_mru_ghost_evictable_metadata;
- /*
- * Total number of bytes consumed by ARC buffers residing in the
- * arc_mfu state. This includes *all* buffers in the arc_mfu
- * state; e.g. data, metadata, evictable, and unevictable buffers
- * are all included in this value.
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_mfu_size;
- /*
- * Number of bytes consumed by ARC buffers that are eligible for
- * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu
- * state.
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_mfu_evictable_data;
- /*
- * Number of bytes consumed by ARC buffers that are eligible for
- * eviction, of type ARC_BUFC_METADATA, and reside in the
- * arc_mfu state.
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_mfu_evictable_metadata;
- /*
- * Total number of bytes that *would have been* consumed by ARC
- * buffers in the arc_mfu_ghost state. See the comment above
- * arcstat_mru_ghost_size for more details.
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_mfu_ghost_size;
- /*
- * Number of bytes that *would have been* consumed by ARC
- * buffers that are eligible for eviction, of type
- * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state.
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_mfu_ghost_evictable_data;
- /*
- * Number of bytes that *would have been* consumed by ARC
- * buffers that are eligible for eviction, of type
- * ARC_BUFC_METADATA, and linked off the arc_mfu_ghost state.
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_mfu_ghost_evictable_metadata;
- kstat_named_t arcstat_l2_hits;
- kstat_named_t arcstat_l2_misses;
- kstat_named_t arcstat_l2_feeds;
- kstat_named_t arcstat_l2_rw_clash;
- kstat_named_t arcstat_l2_read_bytes;
- kstat_named_t arcstat_l2_write_bytes;
- kstat_named_t arcstat_l2_writes_sent;
- kstat_named_t arcstat_l2_writes_done;
- kstat_named_t arcstat_l2_writes_error;
- kstat_named_t arcstat_l2_writes_lock_retry;
- kstat_named_t arcstat_l2_evict_lock_retry;
- kstat_named_t arcstat_l2_evict_reading;
- kstat_named_t arcstat_l2_evict_l1cached;
- kstat_named_t arcstat_l2_free_on_write;
- kstat_named_t arcstat_l2_abort_lowmem;
- kstat_named_t arcstat_l2_cksum_bad;
- kstat_named_t arcstat_l2_io_error;
- kstat_named_t arcstat_l2_lsize;
- kstat_named_t arcstat_l2_psize;
- /* Not updated directly; only synced in arc_kstat_update. */
- kstat_named_t arcstat_l2_hdr_size;
- kstat_named_t arcstat_l2_write_trylock_fail;
- kstat_named_t arcstat_l2_write_passed_headroom;
- kstat_named_t arcstat_l2_write_spa_mismatch;
- kstat_named_t arcstat_l2_write_in_l2;
- kstat_named_t arcstat_l2_write_hdr_io_in_progress;
- kstat_named_t arcstat_l2_write_not_cacheable;
- kstat_named_t arcstat_l2_write_full;
- kstat_named_t arcstat_l2_write_buffer_iter;
- kstat_named_t arcstat_l2_write_pios;
- kstat_named_t arcstat_l2_write_buffer_bytes_scanned;
- kstat_named_t arcstat_l2_write_buffer_list_iter;
- kstat_named_t arcstat_l2_write_buffer_list_null_iter;
- kstat_named_t arcstat_memory_throttle_count;
- kstat_named_t arcstat_memory_direct_count;
- kstat_named_t arcstat_memory_indirect_count;
- kstat_named_t arcstat_memory_all_bytes;
- kstat_named_t arcstat_memory_free_bytes;
- kstat_named_t arcstat_memory_available_bytes;
- kstat_named_t arcstat_no_grow;
- kstat_named_t arcstat_tempreserve;
- kstat_named_t arcstat_loaned_bytes;
- kstat_named_t arcstat_prune;
- /* Not updated directly; only synced in arc_kstat_update. */
- kstat_named_t arcstat_meta_used;
- kstat_named_t arcstat_meta_limit;
- kstat_named_t arcstat_dnode_limit;
- kstat_named_t arcstat_meta_max;
- kstat_named_t arcstat_meta_min;
- kstat_named_t arcstat_async_upgrade_sync;
- kstat_named_t arcstat_demand_hit_predictive_prefetch;
- kstat_named_t arcstat_demand_hit_prescient_prefetch;
-} arc_stats_t;
-
-static arc_stats_t arc_stats = {
- { "hits", KSTAT_DATA_UINT64 },
- { "misses", KSTAT_DATA_UINT64 },
- { "demand_data_hits", KSTAT_DATA_UINT64 },
- { "demand_data_misses", KSTAT_DATA_UINT64 },
- { "demand_metadata_hits", KSTAT_DATA_UINT64 },
- { "demand_metadata_misses", KSTAT_DATA_UINT64 },
- { "prefetch_data_hits", KSTAT_DATA_UINT64 },
- { "prefetch_data_misses", KSTAT_DATA_UINT64 },
- { "prefetch_metadata_hits", KSTAT_DATA_UINT64 },
- { "prefetch_metadata_misses", KSTAT_DATA_UINT64 },
- { "mru_hits", KSTAT_DATA_UINT64 },
- { "mru_ghost_hits", KSTAT_DATA_UINT64 },
- { "mfu_hits", KSTAT_DATA_UINT64 },
- { "mfu_ghost_hits", KSTAT_DATA_UINT64 },
- { "allocated", KSTAT_DATA_UINT64 },
- { "deleted", KSTAT_DATA_UINT64 },
- { "mutex_miss", KSTAT_DATA_UINT64 },
- { "access_skip", KSTAT_DATA_UINT64 },
- { "evict_skip", KSTAT_DATA_UINT64 },
- { "evict_not_enough", KSTAT_DATA_UINT64 },
- { "evict_l2_cached", KSTAT_DATA_UINT64 },
- { "evict_l2_eligible", KSTAT_DATA_UINT64 },
- { "evict_l2_ineligible", KSTAT_DATA_UINT64 },
- { "evict_l2_skip", KSTAT_DATA_UINT64 },
- { "hash_elements", KSTAT_DATA_UINT64 },
- { "hash_elements_max", KSTAT_DATA_UINT64 },
- { "hash_collisions", KSTAT_DATA_UINT64 },
- { "hash_chains", KSTAT_DATA_UINT64 },
- { "hash_chain_max", KSTAT_DATA_UINT64 },
- { "p", KSTAT_DATA_UINT64 },
- { "c", KSTAT_DATA_UINT64 },
- { "c_min", KSTAT_DATA_UINT64 },
- { "c_max", KSTAT_DATA_UINT64 },
- { "size", KSTAT_DATA_UINT64 },
- { "compressed_size", KSTAT_DATA_UINT64 },
- { "uncompressed_size", KSTAT_DATA_UINT64 },
- { "overhead_size", KSTAT_DATA_UINT64 },
- { "hdr_size", KSTAT_DATA_UINT64 },
- { "data_size", KSTAT_DATA_UINT64 },
- { "metadata_size", KSTAT_DATA_UINT64 },
- { "dbuf_size", KSTAT_DATA_UINT64 },
- { "dnode_size", KSTAT_DATA_UINT64 },
- { "bonus_size", KSTAT_DATA_UINT64 },
-#if defined(__FreeBSD__) && defined(COMPAT_FREEBSD11)
- { "other_size", KSTAT_DATA_UINT64 },
-#endif
- { "anon_size", KSTAT_DATA_UINT64 },
- { "anon_evictable_data", KSTAT_DATA_UINT64 },
- { "anon_evictable_metadata", KSTAT_DATA_UINT64 },
- { "mru_size", KSTAT_DATA_UINT64 },
- { "mru_evictable_data", KSTAT_DATA_UINT64 },
- { "mru_evictable_metadata", KSTAT_DATA_UINT64 },
- { "mru_ghost_size", KSTAT_DATA_UINT64 },
- { "mru_ghost_evictable_data", KSTAT_DATA_UINT64 },
- { "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
- { "mfu_size", KSTAT_DATA_UINT64 },
- { "mfu_evictable_data", KSTAT_DATA_UINT64 },
- { "mfu_evictable_metadata", KSTAT_DATA_UINT64 },
- { "mfu_ghost_size", KSTAT_DATA_UINT64 },
- { "mfu_ghost_evictable_data", KSTAT_DATA_UINT64 },
- { "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
- { "l2_hits", KSTAT_DATA_UINT64 },
- { "l2_misses", KSTAT_DATA_UINT64 },
- { "l2_feeds", KSTAT_DATA_UINT64 },
- { "l2_rw_clash", KSTAT_DATA_UINT64 },
- { "l2_read_bytes", KSTAT_DATA_UINT64 },
- { "l2_write_bytes", KSTAT_DATA_UINT64 },
- { "l2_writes_sent", KSTAT_DATA_UINT64 },
- { "l2_writes_done", KSTAT_DATA_UINT64 },
- { "l2_writes_error", KSTAT_DATA_UINT64 },
- { "l2_writes_lock_retry", KSTAT_DATA_UINT64 },
- { "l2_evict_lock_retry", KSTAT_DATA_UINT64 },
- { "l2_evict_reading", KSTAT_DATA_UINT64 },
- { "l2_evict_l1cached", KSTAT_DATA_UINT64 },
- { "l2_free_on_write", KSTAT_DATA_UINT64 },
- { "l2_abort_lowmem", KSTAT_DATA_UINT64 },
- { "l2_cksum_bad", KSTAT_DATA_UINT64 },
- { "l2_io_error", KSTAT_DATA_UINT64 },
- { "l2_size", KSTAT_DATA_UINT64 },
- { "l2_asize", KSTAT_DATA_UINT64 },
- { "l2_hdr_size", KSTAT_DATA_UINT64 },
- { "l2_write_trylock_fail", KSTAT_DATA_UINT64 },
- { "l2_write_passed_headroom", KSTAT_DATA_UINT64 },
- { "l2_write_spa_mismatch", KSTAT_DATA_UINT64 },
- { "l2_write_in_l2", KSTAT_DATA_UINT64 },
- { "l2_write_io_in_progress", KSTAT_DATA_UINT64 },
- { "l2_write_not_cacheable", KSTAT_DATA_UINT64 },
- { "l2_write_full", KSTAT_DATA_UINT64 },
- { "l2_write_buffer_iter", KSTAT_DATA_UINT64 },
- { "l2_write_pios", KSTAT_DATA_UINT64 },
- { "l2_write_buffer_bytes_scanned", KSTAT_DATA_UINT64 },
- { "l2_write_buffer_list_iter", KSTAT_DATA_UINT64 },
- { "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 },
- { "memory_throttle_count", KSTAT_DATA_UINT64 },
- { "memory_direct_count", KSTAT_DATA_UINT64 },
- { "memory_indirect_count", KSTAT_DATA_UINT64 },
- { "memory_all_bytes", KSTAT_DATA_UINT64 },
- { "memory_free_bytes", KSTAT_DATA_UINT64 },
- { "memory_available_bytes", KSTAT_DATA_UINT64 },
- { "arc_no_grow", KSTAT_DATA_UINT64 },
- { "arc_tempreserve", KSTAT_DATA_UINT64 },
- { "arc_loaned_bytes", KSTAT_DATA_UINT64 },
- { "arc_prune", KSTAT_DATA_UINT64 },
- { "arc_meta_used", KSTAT_DATA_UINT64 },
- { "arc_meta_limit", KSTAT_DATA_UINT64 },
- { "arc_dnode_limit", KSTAT_DATA_UINT64 },
- { "arc_meta_max", KSTAT_DATA_UINT64 },
- { "arc_meta_min", KSTAT_DATA_UINT64 },
- { "async_upgrade_sync", KSTAT_DATA_UINT64 },
- { "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 },
- { "demand_hit_prescient_prefetch", KSTAT_DATA_UINT64 },
-};
-
-#define ARCSTAT(stat) (arc_stats.stat.value.ui64)
-
-#define ARCSTAT_INCR(stat, val) \
- atomic_add_64(&arc_stats.stat.value.ui64, (val))
-
-#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1)
-#define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1)
-
-#define ARCSTAT_MAX(stat, val) { \
- uint64_t m; \
- while ((val) > (m = arc_stats.stat.value.ui64) && \
- (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \
- continue; \
-}
-
-#define ARCSTAT_MAXSTAT(stat) \
- ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
-
-/*
- * We define a macro to allow ARC hits/misses to be easily broken down by
- * two separate conditions, giving a total of four different subtypes for
- * each of hits and misses (so eight statistics total).
- */
-#define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
- if (cond1) { \
- if (cond2) { \
- ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
- } else { \
- ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
- } \
- } else { \
- if (cond2) { \
- ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
- } else { \
- ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
- } \
- }
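-
-/*
- * Example (added for illustration): the hit path classifies each hit by
- * demand vs. prefetch and data vs. metadata in one invocation:
- *
- * ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), demand, prefetch,
- *     !HDR_ISTYPE_METADATA(hdr), data, metadata, hits);
- *
- * which bumps exactly one of arcstat_demand_data_hits,
- * arcstat_demand_metadata_hits, arcstat_prefetch_data_hits, or
- * arcstat_prefetch_metadata_hits.
- */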
-
-kstat_t *arc_ksp;
-static arc_state_t *arc_anon;
-static arc_state_t *arc_mru;
-static arc_state_t *arc_mru_ghost;
-static arc_state_t *arc_mfu;
-static arc_state_t *arc_mfu_ghost;
-static arc_state_t *arc_l2c_only;
-
-/*
- * There are several ARC variables that are critical to export as kstats --
- * but we don't want to have to grovel around in the kstat whenever we wish to
- * manipulate them. For these variables, we therefore define them to be in
- * terms of the statistic variable. This assures that we are not introducing
- * the possibility of inconsistency by having shadow copies of the variables,
- * while still allowing the code to be readable.
- */
-#define arc_p ARCSTAT(arcstat_p) /* target size of MRU */
-#define arc_c ARCSTAT(arcstat_c) /* target size of cache */
-#define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */
-#define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */
-#define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */
-#define arc_dnode_limit ARCSTAT(arcstat_dnode_limit) /* max size for dnodes */
-#define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */
-#define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */
-#define arc_dbuf_size ARCSTAT(arcstat_dbuf_size) /* dbuf metadata */
-#define arc_dnode_size ARCSTAT(arcstat_dnode_size) /* dnode metadata */
-#define arc_bonus_size ARCSTAT(arcstat_bonus_size) /* bonus buffer metadata */
-
-/* compressed size of entire arc */
-#define arc_compressed_size ARCSTAT(arcstat_compressed_size)
-/* uncompressed size of entire arc */
-#define arc_uncompressed_size ARCSTAT(arcstat_uncompressed_size)
-/* number of bytes in the arc from arc_buf_t's */
-#define arc_overhead_size ARCSTAT(arcstat_overhead_size)
-
-/*
- * There are also some ARC variables that we want to export, but that are
- * updated so often that having the canonical representation be the statistic
- * variable causes a performance bottleneck. We want to use aggsum_t's for these
- * instead, but still be able to export the kstat in the same way as before.
- * The solution is to always use the aggsum version, except in the kstat update
- * callback.
- */
-aggsum_t arc_size;
-aggsum_t arc_meta_used;
-aggsum_t astat_data_size;
-aggsum_t astat_metadata_size;
-aggsum_t astat_hdr_size;
-aggsum_t astat_bonus_size;
-aggsum_t astat_dnode_size;
-aggsum_t astat_dbuf_size;
-aggsum_t astat_l2_hdr_size;
-
-static list_t arc_prune_list;
-static kmutex_t arc_prune_mtx;
-static taskq_t *arc_prune_taskq;
-
-static int arc_no_grow; /* Don't try to grow cache size */
-static hrtime_t arc_growtime;
-static uint64_t arc_tempreserve;
-static uint64_t arc_loaned_bytes;
-
-typedef struct arc_callback arc_callback_t;
-
-struct arc_callback {
- void *acb_private;
- arc_read_done_func_t *acb_done;
- arc_buf_t *acb_buf;
- boolean_t acb_compressed;
- zio_t *acb_zio_dummy;
- zio_t *acb_zio_head;
- arc_callback_t *acb_next;
-};
-
-typedef struct arc_write_callback arc_write_callback_t;
-
-struct arc_write_callback {
- void *awcb_private;
- arc_write_done_func_t *awcb_ready;
- arc_write_done_func_t *awcb_children_ready;
- arc_write_done_func_t *awcb_physdone;
- arc_write_done_func_t *awcb_done;
- arc_buf_t *awcb_buf;
-};
-
-/*
- * ARC buffers are separated into multiple structs as a memory saving measure:
- * - Common fields struct, always defined, and embedded within it:
- * - L2-only fields, always allocated but undefined when not in L2ARC
- * - L1-only fields, only allocated when in L1ARC
- *
- * Buffer in L1 Buffer only in L2
- * +------------------------+ +------------------------+
- * | arc_buf_hdr_t | | arc_buf_hdr_t |
- * | | | |
- * | | | |
- * | | | |
- * +------------------------+ +------------------------+
- * | l2arc_buf_hdr_t | | l2arc_buf_hdr_t |
- * | (undefined if L1-only) | | |
- * +------------------------+ +------------------------+
- * | l1arc_buf_hdr_t |
- * | |
- * | |
- * | |
- * | |
- * +------------------------+
- *
- * Because it's possible for the L2ARC to become extremely large, we can wind
- * up eating a lot of memory in L2ARC buffer headers, so the size of a header
- * is minimized by only allocating the fields necessary for an L1-cached buffer
- * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and
- * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple
- * words in pointers. arc_hdr_realloc() is used to switch a header between
- * these two allocation states.
- */
-typedef struct l1arc_buf_hdr {
- kmutex_t b_freeze_lock;
- zio_cksum_t *b_freeze_cksum;
-#ifdef ZFS_DEBUG
- /*
- * Used for debugging with kmem_flags - by allocating and freeing
- * b_thawed when the buffer is thawed, we get a record of the stack
- * trace that thawed it.
- */
- void *b_thawed;
-#endif
-
- arc_buf_t *b_buf;
- uint32_t b_bufcnt;
- /* for waiting on writes to complete */
- kcondvar_t b_cv;
- uint8_t b_byteswap;
-
- /* protected by arc state mutex */
- arc_state_t *b_state;
- multilist_node_t b_arc_node;
-
- /* updated atomically */
- clock_t b_arc_access;
- uint32_t b_mru_hits;
- uint32_t b_mru_ghost_hits;
- uint32_t b_mfu_hits;
- uint32_t b_mfu_ghost_hits;
- uint32_t b_l2_hits;
-
- /* self protecting */
- zfs_refcount_t b_refcnt;
-
- arc_callback_t *b_acb;
- abd_t *b_pabd;
-} l1arc_buf_hdr_t;
-
-typedef struct l2arc_dev l2arc_dev_t;
-
-typedef struct l2arc_buf_hdr {
- /* protected by arc_buf_hdr mutex */
- l2arc_dev_t *b_dev; /* L2ARC device */
- uint64_t b_daddr; /* disk address, offset byte */
- uint32_t b_hits;
-
- list_node_t b_l2node;
-} l2arc_buf_hdr_t;
-
-struct arc_buf_hdr {
- /* protected by hash lock */
- dva_t b_dva;
- uint64_t b_birth;
-
- arc_buf_contents_t b_type;
- arc_buf_hdr_t *b_hash_next;
- arc_flags_t b_flags;
-
- /*
- * This field stores the size of the data buffer after
- * compression, and is set in the arc's zio completion handlers.
- * It is in units of SPA_MINBLOCKSIZE (e.g. 1 == 512 bytes).
- *
- * While the block pointers can store up to 32MB in their psize
- * field, we can only store up to 32MB minus 512B. This is due
- * to the bp using a bias of 1, whereas we use a bias of 0 (i.e.
- * a field of zeros represents 512B in the bp). We can't use a
- * bias of 1 since we need to reserve a psize of zero here to
- * represent holes and embedded blocks.
- *
- * This isn't a problem in practice, since the maximum size of a
- * buffer is limited to 16MB, so we never need to store 32MB in
- * this field. Even in the upstream illumos code base, the
- * maximum size of a buffer is limited to 16MB.
- */
- uint16_t b_psize;
-
- /*
- * This field stores the size of the data buffer before
- * compression, and cannot change once set. It is in units
- * of SPA_MINBLOCKSIZE (e.g. 2 == 1024 bytes)
- */
- uint16_t b_lsize; /* immutable */
- uint64_t b_spa; /* immutable */
-
- /* L2ARC fields. Undefined when not in L2ARC. */
- l2arc_buf_hdr_t b_l2hdr;
- /* L1ARC fields. Undefined when in l2arc_only state */
- l1arc_buf_hdr_t b_l1hdr;
-};
-
-#if defined(__FreeBSD__) && defined(_KERNEL)
-static int
-sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS)
-{
- uint64_t val;
- int err;
-
- val = arc_meta_limit;
- err = sysctl_handle_64(oidp, &val, 0, req);
- if (err != 0 || req->newptr == NULL)
- return (err);
-
- if (val <= 0 || val > arc_c_max)
- return (EINVAL);
-
- arc_meta_limit = val;
-
- mutex_enter(&arc_adjust_lock);
- arc_adjust_needed = B_TRUE;
- mutex_exit(&arc_adjust_lock);
- zthr_wakeup(arc_adjust_zthr);
-
- return (0);
-}
-
-static int
-sysctl_vfs_zfs_arc_no_grow_shift(SYSCTL_HANDLER_ARGS)
-{
- uint32_t val;
- int err;
-
- val = arc_no_grow_shift;
- err = sysctl_handle_32(oidp, &val, 0, req);
- if (err != 0 || req->newptr == NULL)
- return (err);
-
- if (val >= arc_shrink_shift)
- return (EINVAL);
-
- arc_no_grow_shift = val;
- return (0);
-}
-
-static int
-sysctl_vfs_zfs_arc_max(SYSCTL_HANDLER_ARGS)
-{
- uint64_t val;
- int err;
-
- val = zfs_arc_max;
- err = sysctl_handle_64(oidp, &val, 0, req);
- if (err != 0 || req->newptr == NULL)
- return (err);
-
- if (zfs_arc_max == 0) {
- /* Loader tunable so blindly set */
- zfs_arc_max = val;
- return (0);
- }
-
- if (val < arc_abs_min || val > kmem_size())
- return (EINVAL);
- if (val < arc_c_min)
- return (EINVAL);
- if (zfs_arc_meta_limit > 0 && val < zfs_arc_meta_limit)
- return (EINVAL);
-
- arc_c_max = val;
-
- arc_c = arc_c_max;
- arc_p = (arc_c >> 1);
-
- if (zfs_arc_meta_limit == 0) {
- /* limit meta-data to 1/4 of the arc capacity */
- arc_meta_limit = arc_c_max / 4;
- }
-
- /* if kmem_flags are set, lets try to use less memory */
- if (kmem_debugging())
- arc_c = arc_c / 2;
-
- zfs_arc_max = arc_c;
-
- mutex_enter(&arc_adjust_lock);
- arc_adjust_needed = B_TRUE;
- mutex_exit(&arc_adjust_lock);
- zthr_wakeup(arc_adjust_zthr);
-
- return (0);
-}
-
-static int
-sysctl_vfs_zfs_arc_min(SYSCTL_HANDLER_ARGS)
-{
- uint64_t val;
- int err;
-
- val = zfs_arc_min;
- err = sysctl_handle_64(oidp, &val, 0, req);
- if (err != 0 || req->newptr == NULL)
- return (err);
-
- if (zfs_arc_min == 0) {
- /* Loader tunable so blindly set */
- zfs_arc_min = val;
- return (0);
- }
-
- if (val < arc_abs_min || val > arc_c_max)
- return (EINVAL);
-
- arc_c_min = val;
-
- if (zfs_arc_meta_min == 0)
- arc_meta_min = arc_c_min / 2;
-
- if (arc_c < arc_c_min)
- arc_c = arc_c_min;
-
- zfs_arc_min = arc_c_min;
-
- return (0);
-}
-#endif
-
-#define GHOST_STATE(state) \
- ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \
- (state) == arc_l2c_only)
-
-#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE)
-#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS)
-#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_FLAG_IO_ERROR)
-#define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH)
-#define HDR_PRESCIENT_PREFETCH(hdr) \
- ((hdr)->b_flags & ARC_FLAG_PRESCIENT_PREFETCH)
-#define HDR_COMPRESSION_ENABLED(hdr) \
- ((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC)
-
-#define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_FLAG_L2CACHE)
-#define HDR_L2_READING(hdr) \
- (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \
- ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR))
-#define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITING)
-#define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_FLAG_L2_EVICTED)
-#define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD)
-#define HDR_SHARED_DATA(hdr) ((hdr)->b_flags & ARC_FLAG_SHARED_DATA)
-
-#define HDR_ISTYPE_METADATA(hdr) \
- ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA)
-#define HDR_ISTYPE_DATA(hdr) (!HDR_ISTYPE_METADATA(hdr))
-
-#define HDR_HAS_L1HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR)
-#define HDR_HAS_L2HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)
-
-/* For storing compression mode in b_flags */
-#define HDR_COMPRESS_OFFSET (highbit64(ARC_FLAG_COMPRESS_0) - 1)
-
-#define HDR_GET_COMPRESS(hdr) ((enum zio_compress)BF32_GET((hdr)->b_flags, \
- HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS))
-#define HDR_SET_COMPRESS(hdr, cmp) BF32_SET((hdr)->b_flags, \
- HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS, (cmp))
-
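-/*
- * Illustrative sketch (not part of the original source): the compression
- * algorithm is packed into SPA_COMPRESSBITS bits of b_flags starting at
- * HDR_COMPRESS_OFFSET, so a round trip looks like:
- *
- *     HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_LZ4);
- *     ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_LZ4);
- */
-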
-#define ARC_BUF_LAST(buf) ((buf)->b_next == NULL)
-#define ARC_BUF_SHARED(buf) ((buf)->b_flags & ARC_BUF_FLAG_SHARED)
-#define ARC_BUF_COMPRESSED(buf) ((buf)->b_flags & ARC_BUF_FLAG_COMPRESSED)
-
-/*
- * Other sizes
- */
-
-#define HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
-#define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr))
-
-/*
- * Hash table routines
- */
-
-#define HT_LOCK_PAD CACHE_LINE_SIZE
-
-struct ht_lock {
- kmutex_t ht_lock;
-#ifdef _KERNEL
- unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
-#endif
-};
-
-#define BUF_LOCKS 256
-typedef struct buf_hash_table {
- uint64_t ht_mask;
- arc_buf_hdr_t **ht_table;
- struct ht_lock ht_locks[BUF_LOCKS] __aligned(CACHE_LINE_SIZE);
-} buf_hash_table_t;
-
-static buf_hash_table_t buf_hash_table;
-
-#define BUF_HASH_INDEX(spa, dva, birth) \
- (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
-#define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[(idx) & (BUF_LOCKS-1)])
-#define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
-#define HDR_LOCK(hdr) \
- (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
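-
-/*
- * Illustrative usage sketch (not part of the original source): callers
- * derive the per-bucket lock from the hdr's identity and hold it across
- * any hash-chain or b_flags manipulation:
- *
- *     kmutex_t *hash_lock = HDR_LOCK(hdr);
- *     mutex_enter(hash_lock);
- *     ...inspect or update the hdr...
- *     mutex_exit(hash_lock);
- */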
-
-uint64_t zfs_crc64_table[256];
-
-/*
- * Level 2 ARC
- */
-
-#define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */
-#define L2ARC_HEADROOM 2 /* num of writes */
-/*
- * If we discover during ARC scan any buffers to be compressed, we boost
- * our headroom for the next scanning cycle by this percentage multiple.
- */
-#define L2ARC_HEADROOM_BOOST 200
-#define L2ARC_FEED_SECS 1 /* caching interval secs */
-#define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */
-
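-/*
- * Illustrative sketch (not part of the original source): with the defaults
- * above, each feed cycle scans up to
- *
- *     l2arc_write_max * l2arc_headroom = 8MB * 2 = 16MB
- *
- * ahead of the device's write hand, and when compressed buffers were seen
- * on the previous cycle the headroom is scaled by
- * L2ARC_HEADROOM_BOOST / 100 (i.e. doubled to 32MB).
- */
-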
-#define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent)
-#define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done)
-
-/* L2ARC Performance Tunables */
-uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */
-uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */
-uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */
-uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
-uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */
-uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */
-boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */
-boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */
-boolean_t l2arc_norw = B_TRUE; /* no reads during writes */
-
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RWTUN,
- &l2arc_write_max, 0, "max write size");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RWTUN,
- &l2arc_write_boost, 0, "extra write during warmup");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RWTUN,
- &l2arc_headroom, 0, "number of dev writes");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RWTUN,
- &l2arc_feed_secs, 0, "interval seconds");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RWTUN,
- &l2arc_feed_min_ms, 0, "min interval milliseconds");
-
-SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RWTUN,
- &l2arc_noprefetch, 0, "don't cache prefetch bufs");
-SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RWTUN,
- &l2arc_feed_again, 0, "turbo warmup");
-SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RWTUN,
- &l2arc_norw, 0, "no reads during writes");
-
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD,
- &ARC_anon.arcs_size.rc_count, 0, "size of anonymous state");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_esize, CTLFLAG_RD,
- &ARC_anon.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
- "size of metadata in anonymous state");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_esize, CTLFLAG_RD,
- &ARC_anon.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
- "size of data in anonymous state");
-
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD,
- &ARC_mru.arcs_size.rc_count, 0, "size of mru state");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_esize, CTLFLAG_RD,
- &ARC_mru.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
- "size of metadata in mru state");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_esize, CTLFLAG_RD,
- &ARC_mru.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
- "size of data in mru state");
-
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD,
- &ARC_mru_ghost.arcs_size.rc_count, 0, "size of mru ghost state");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_esize, CTLFLAG_RD,
- &ARC_mru_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
- "size of metadata in mru ghost state");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_esize, CTLFLAG_RD,
- &ARC_mru_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
- "size of data in mru ghost state");
-
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD,
- &ARC_mfu.arcs_size.rc_count, 0, "size of mfu state");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_esize, CTLFLAG_RD,
- &ARC_mfu.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
- "size of metadata in mfu state");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_esize, CTLFLAG_RD,
- &ARC_mfu.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
- "size of data in mfu state");
-
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD,
- &ARC_mfu_ghost.arcs_size.rc_count, 0, "size of mfu ghost state");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_esize, CTLFLAG_RD,
- &ARC_mfu_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
- "size of metadata in mfu ghost state");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_esize, CTLFLAG_RD,
- &ARC_mfu_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
- "size of data in mfu ghost state");
-
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD,
- &ARC_l2c_only.arcs_size.rc_count, 0, "size of l2c_only state");
-
-SYSCTL_UINT(_vfs_zfs, OID_AUTO, arc_min_prefetch_ms, CTLFLAG_RW,
- &zfs_arc_min_prefetch_ms, 0, "Min life of prefetch block in ms");
-SYSCTL_UINT(_vfs_zfs, OID_AUTO, arc_min_prescient_prefetch_ms, CTLFLAG_RW,
- &zfs_arc_min_prescient_prefetch_ms, 0,
- "Min life of prescient prefetched block in ms");
-
-/*
- * L2ARC Internals
- */
-struct l2arc_dev {
- vdev_t *l2ad_vdev; /* vdev */
- spa_t *l2ad_spa; /* spa */
- uint64_t l2ad_hand; /* next write location */
- uint64_t l2ad_start; /* first addr on device */
- uint64_t l2ad_end; /* last addr on device */
- boolean_t l2ad_first; /* first sweep through */
- boolean_t l2ad_writing; /* currently writing */
- kmutex_t l2ad_mtx; /* lock for buffer list */
- list_t l2ad_buflist; /* buffer list */
- list_node_t l2ad_node; /* device list node */
- zfs_refcount_t l2ad_alloc; /* allocated bytes */
-};
-
-static list_t L2ARC_dev_list; /* device list */
-static list_t *l2arc_dev_list; /* device list pointer */
-static kmutex_t l2arc_dev_mtx; /* device list mutex */
-static l2arc_dev_t *l2arc_dev_last; /* last device used */
-static list_t L2ARC_free_on_write; /* free after write buf list */
-static list_t *l2arc_free_on_write; /* free after write list ptr */
-static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */
-static uint64_t l2arc_ndev; /* number of devices */
-
-typedef struct l2arc_read_callback {
- arc_buf_hdr_t *l2rcb_hdr; /* read header */
- blkptr_t l2rcb_bp; /* original blkptr */
- zbookmark_phys_t l2rcb_zb; /* original bookmark */
- int l2rcb_flags; /* original flags */
- abd_t *l2rcb_abd; /* temporary buffer */
-} l2arc_read_callback_t;
-
-typedef struct l2arc_write_callback {
- l2arc_dev_t *l2wcb_dev; /* device info */
- arc_buf_hdr_t *l2wcb_head; /* head of write buflist */
-} l2arc_write_callback_t;
-
-typedef struct l2arc_data_free {
- /* protected by l2arc_free_on_write_mtx */
- abd_t *l2df_abd;
- size_t l2df_size;
- arc_buf_contents_t l2df_type;
- list_node_t l2df_list_node;
-} l2arc_data_free_t;
-
-static kmutex_t l2arc_feed_thr_lock;
-static kcondvar_t l2arc_feed_thr_cv;
-static uint8_t l2arc_thread_exit;
-
-static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, void *, boolean_t);
-static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, void *);
-static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, void *, boolean_t);
-static void arc_free_data_abd(arc_buf_hdr_t *, abd_t *, uint64_t, void *);
-static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, void *);
-static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag);
-static void arc_hdr_free_pabd(arc_buf_hdr_t *);
-static void arc_hdr_alloc_pabd(arc_buf_hdr_t *, boolean_t);
-static void arc_access(arc_buf_hdr_t *, kmutex_t *);
-static boolean_t arc_is_overflowing(void);
-static void arc_buf_watch(arc_buf_t *);
-static void arc_prune_async(int64_t);
-
-static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
-static uint32_t arc_bufc_to_flags(arc_buf_contents_t);
-static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags);
-static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags);
-
-static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *);
-static void l2arc_read_done(zio_t *);
-
-static void
-l2arc_trim(const arc_buf_hdr_t *hdr)
-{
- l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
-
- ASSERT(HDR_HAS_L2HDR(hdr));
- ASSERT(MUTEX_HELD(&dev->l2ad_mtx));
-
- if (HDR_GET_PSIZE(hdr) != 0) {
- trim_map_free(dev->l2ad_vdev, hdr->b_l2hdr.b_daddr,
- HDR_GET_PSIZE(hdr), 0);
- }
-}
-
-/*
- * We use Cityhash for this. It's fast, and has good hash properties without
- * requiring any large static buffers.
- */
-static uint64_t
-buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
-{
- return (cityhash4(spa, dva->dva_word[0], dva->dva_word[1], birth));
-}
-
-#define HDR_EMPTY(hdr) \
- ((hdr)->b_dva.dva_word[0] == 0 && \
- (hdr)->b_dva.dva_word[1] == 0)
-
-#define HDR_EQUAL(spa, dva, birth, hdr) \
- (((hdr)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \
- ((hdr)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \
- ((hdr)->b_birth == (birth)) && ((hdr)->b_spa == (spa)))
-
-static void
-buf_discard_identity(arc_buf_hdr_t *hdr)
-{
- hdr->b_dva.dva_word[0] = 0;
- hdr->b_dva.dva_word[1] = 0;
- hdr->b_birth = 0;
-}
-
-static arc_buf_hdr_t *
-buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp)
-{
- const dva_t *dva = BP_IDENTITY(bp);
- uint64_t birth = BP_PHYSICAL_BIRTH(bp);
- uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
- kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
- arc_buf_hdr_t *hdr;
-
- mutex_enter(hash_lock);
- for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL;
- hdr = hdr->b_hash_next) {
- if (HDR_EQUAL(spa, dva, birth, hdr)) {
- *lockp = hash_lock;
- return (hdr);
- }
- }
- mutex_exit(hash_lock);
- *lockp = NULL;
- return (NULL);
-}
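-
-/*
- * Illustrative usage sketch (not part of the original source): on a hit,
- * buf_hash_find() returns with the bucket lock held and the caller owns
- * the unlock; here guid is the spa's load guid and bp the block pointer
- * being looked up:
- *
- *     kmutex_t *hash_lock = NULL;
- *     arc_buf_hdr_t *hdr = buf_hash_find(guid, bp, &hash_lock);
- *     if (hdr != NULL) {
- *             ...use hdr while hash_lock is held...
- *             mutex_exit(hash_lock);
- *     }
- */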
-
-/*
- * Insert an entry into the hash table. If there is already an element
- * equal to hdr in the hash table, then the existing element will be
- * returned and the new element will not be inserted.
- * Otherwise returns NULL.
- * If lockp == NULL, the caller is assumed to already hold the hash lock.
- */
-static arc_buf_hdr_t *
-buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp)
-{
- uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
- kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
- arc_buf_hdr_t *fhdr;
- uint32_t i;
-
- ASSERT(!DVA_IS_EMPTY(&hdr->b_dva));
- ASSERT(hdr->b_birth != 0);
- ASSERT(!HDR_IN_HASH_TABLE(hdr));
-
- if (lockp != NULL) {
- *lockp = hash_lock;
- mutex_enter(hash_lock);
- } else {
- ASSERT(MUTEX_HELD(hash_lock));
- }
-
- for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL;
- fhdr = fhdr->b_hash_next, i++) {
- if (HDR_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr))
- return (fhdr);
- }
-
- hdr->b_hash_next = buf_hash_table.ht_table[idx];
- buf_hash_table.ht_table[idx] = hdr;
- arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
-
- /* collect some hash table performance data */
- if (i > 0) {
- ARCSTAT_BUMP(arcstat_hash_collisions);
- if (i == 1)
- ARCSTAT_BUMP(arcstat_hash_chains);
-
- ARCSTAT_MAX(arcstat_hash_chain_max, i);
- }
-
- ARCSTAT_BUMP(arcstat_hash_elements);
- ARCSTAT_MAXSTAT(arcstat_hash_elements);
-
- return (NULL);
-}
-
-static void
-buf_hash_remove(arc_buf_hdr_t *hdr)
-{
- arc_buf_hdr_t *fhdr, **hdrp;
- uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
-
- ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
- ASSERT(HDR_IN_HASH_TABLE(hdr));
-
- hdrp = &buf_hash_table.ht_table[idx];
- while ((fhdr = *hdrp) != hdr) {
- ASSERT3P(fhdr, !=, NULL);
- hdrp = &fhdr->b_hash_next;
- }
- *hdrp = hdr->b_hash_next;
- hdr->b_hash_next = NULL;
- arc_hdr_clear_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
-
- /* collect some hash table performance data */
- ARCSTAT_BUMPDOWN(arcstat_hash_elements);
-
- if (buf_hash_table.ht_table[idx] &&
- buf_hash_table.ht_table[idx]->b_hash_next == NULL)
- ARCSTAT_BUMPDOWN(arcstat_hash_chains);
-}
-
-/*
- * Global data structures and functions for the buf kmem cache.
- */
-static kmem_cache_t *hdr_full_cache;
-static kmem_cache_t *hdr_l2only_cache;
-static kmem_cache_t *buf_cache;
-
-static void
-buf_fini(void)
-{
- int i;
-
- kmem_free(buf_hash_table.ht_table,
- (buf_hash_table.ht_mask + 1) * sizeof (void *));
- for (i = 0; i < BUF_LOCKS; i++)
- mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
- kmem_cache_destroy(hdr_full_cache);
- kmem_cache_destroy(hdr_l2only_cache);
- kmem_cache_destroy(buf_cache);
-}
-
-/*
- * Constructor callback - called when the cache is empty
- * and a new buf is requested.
- */
-/* ARGSUSED */
-static int
-hdr_full_cons(void *vbuf, void *unused, int kmflag)
-{
- arc_buf_hdr_t *hdr = vbuf;
-
- bzero(hdr, HDR_FULL_SIZE);
- cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL);
- zfs_refcount_create(&hdr->b_l1hdr.b_refcnt);
- mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
- multilist_link_init(&hdr->b_l1hdr.b_arc_node);
- arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS);
-
- return (0);
-}
-
-/* ARGSUSED */
-static int
-hdr_l2only_cons(void *vbuf, void *unused, int kmflag)
-{
- arc_buf_hdr_t *hdr = vbuf;
-
- bzero(hdr, HDR_L2ONLY_SIZE);
- arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
-
- return (0);
-}
-
-/* ARGSUSED */
-static int
-buf_cons(void *vbuf, void *unused, int kmflag)
-{
- arc_buf_t *buf = vbuf;
-
- bzero(buf, sizeof (arc_buf_t));
- mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
- arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
-
- return (0);
-}
-
-/*
- * Destructor callback - called when a cached buf is
- * no longer required.
- */
-/* ARGSUSED */
-static void
-hdr_full_dest(void *vbuf, void *unused)
-{
- arc_buf_hdr_t *hdr = vbuf;
-
- ASSERT(HDR_EMPTY(hdr));
- cv_destroy(&hdr->b_l1hdr.b_cv);
- zfs_refcount_destroy(&hdr->b_l1hdr.b_refcnt);
- mutex_destroy(&hdr->b_l1hdr.b_freeze_lock);
- ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
- arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS);
-}
-
-/* ARGSUSED */
-static void
-hdr_l2only_dest(void *vbuf, void *unused)
-{
- arc_buf_hdr_t *hdr = vbuf;
-
- ASSERT(HDR_EMPTY(hdr));
- arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
-}
-
-/* ARGSUSED */
-static void
-buf_dest(void *vbuf, void *unused)
-{
- arc_buf_t *buf = vbuf;
-
- mutex_destroy(&buf->b_evict_lock);
- arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
-}
-
-/*
- * Reclaim callback -- invoked when memory is low.
- */
-/* ARGSUSED */
-static void
-hdr_recl(void *unused)
-{
- dprintf("hdr_recl called\n");
- /*
- * umem calls the reclaim func when we destroy the buf cache,
- * which is after we do arc_fini().
- */
- if (arc_initialized)
- zthr_wakeup(arc_reap_zthr);
-}
-
-static void
-buf_init(void)
-{
- uint64_t *ct;
- uint64_t hsize = 1ULL << 12;
- int i, j;
-
- /*
- * The hash table is big enough to fill all of physical memory
- * with an average block size of zfs_arc_average_blocksize (default 8K).
- * By default, the table will take up
- * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers).
- */
- while (hsize * zfs_arc_average_blocksize < (uint64_t)physmem * PAGESIZE)
- hsize <<= 1;
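- /*
- * Illustrative sketch (not part of the original source): on a machine
- * with 4GB of physical memory and the default 8K average block size,
- * the loop above stops at the first power of two where
- * hsize * 8192 >= 4GB, i.e. hsize = 2^19 buckets, giving a table of
- * 2^19 * sizeof (void *) = 4MB, matching the 1MB-per-GB estimate.
- */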
-retry:
- buf_hash_table.ht_mask = hsize - 1;
- buf_hash_table.ht_table =
- kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
- if (buf_hash_table.ht_table == NULL) {
- ASSERT(hsize > (1ULL << 8));
- hsize >>= 1;
- goto retry;
- }
-
- hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE,
- 0, hdr_full_cons, hdr_full_dest, hdr_recl, NULL, NULL, 0);
- hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only",
- HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, hdr_recl,
- NULL, NULL, 0);
- buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
- 0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
-
- for (i = 0; i < 256; i++)
- for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
- *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
-
- for (i = 0; i < BUF_LOCKS; i++) {
- mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
- NULL, MUTEX_DEFAULT, NULL);
- }
-}
-
-/*
- * This is the size that the buf occupies in memory. If the buf is compressed,
- * it will correspond to the compressed size. You should use this method of
- * getting the buf size unless you explicitly need the logical size.
- */
-int32_t
-arc_buf_size(arc_buf_t *buf)
-{
- return (ARC_BUF_COMPRESSED(buf) ?
- HDR_GET_PSIZE(buf->b_hdr) : HDR_GET_LSIZE(buf->b_hdr));
-}
-
-int32_t
-arc_buf_lsize(arc_buf_t *buf)
-{
- return (HDR_GET_LSIZE(buf->b_hdr));
-}
-
-enum zio_compress
-arc_get_compression(arc_buf_t *buf)
-{
- return (ARC_BUF_COMPRESSED(buf) ?
- HDR_GET_COMPRESS(buf->b_hdr) : ZIO_COMPRESS_OFF);
-}
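-
-/*
- * Illustrative sketch (not part of the original source): for a hdr caching
- * an LZ4 block with lsize 128KB and psize 4KB:
- *
- *     compressed buf:   arc_buf_size() == 4KB,
- *                       arc_get_compression() == ZIO_COMPRESS_LZ4
- *     uncompressed buf: arc_buf_size() == 128KB,
- *                       arc_get_compression() == ZIO_COMPRESS_OFF
- *
- * arc_buf_lsize() reports 128KB for both.
- */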
-
-#define ARC_MINTIME (hz>>4) /* 62 ms */
-
-static inline boolean_t
-arc_buf_is_shared(arc_buf_t *buf)
-{
- boolean_t shared = (buf->b_data != NULL &&
- buf->b_hdr->b_l1hdr.b_pabd != NULL &&
- abd_is_linear(buf->b_hdr->b_l1hdr.b_pabd) &&
- buf->b_data == abd_to_buf(buf->b_hdr->b_l1hdr.b_pabd));
- IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr));
- IMPLY(shared, ARC_BUF_SHARED(buf));
- IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf));
-
- /*
- * It would be nice to assert arc_can_share() too, but the "hdr isn't
- * already being shared" requirement prevents us from doing that.
- */
-
- return (shared);
-}
-
-/*
- * Free the checksum associated with this header. If there is no checksum, this
- * is a no-op.
- */
-static inline void
-arc_cksum_free(arc_buf_hdr_t *hdr)
-{
- ASSERT(HDR_HAS_L1HDR(hdr));
- mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
- if (hdr->b_l1hdr.b_freeze_cksum != NULL) {
- kmem_free(hdr->b_l1hdr.b_freeze_cksum, sizeof (zio_cksum_t));
- hdr->b_l1hdr.b_freeze_cksum = NULL;
- }
- mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
-}
-
-/*
- * Return true iff at least one of the bufs on hdr is not compressed.
- */
-static boolean_t
-arc_hdr_has_uncompressed_buf(arc_buf_hdr_t *hdr)
-{
- for (arc_buf_t *b = hdr->b_l1hdr.b_buf; b != NULL; b = b->b_next) {
- if (!ARC_BUF_COMPRESSED(b)) {
- return (B_TRUE);
- }
- }
- return (B_FALSE);
-}
-
-/*
- * If we've turned on the ZFS_DEBUG_MODIFY flag, verify that the buf's data
- * matches the checksum that is stored in the hdr. If there is no checksum,
- * or if the buf is compressed, this is a no-op.
- */
-static void
-arc_cksum_verify(arc_buf_t *buf)
-{
- arc_buf_hdr_t *hdr = buf->b_hdr;
- zio_cksum_t zc;
-
- if (!(zfs_flags & ZFS_DEBUG_MODIFY))
- return;
-
- if (ARC_BUF_COMPRESSED(buf)) {
- ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL ||
- arc_hdr_has_uncompressed_buf(hdr));
- return;
- }
-
- ASSERT(HDR_HAS_L1HDR(hdr));
-
- mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
- if (hdr->b_l1hdr.b_freeze_cksum == NULL || HDR_IO_ERROR(hdr)) {
- mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
- return;
- }
-
- fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, &zc);
- if (!ZIO_CHECKSUM_EQUAL(*hdr->b_l1hdr.b_freeze_cksum, zc))
- panic("buffer modified while frozen!");
- mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
-}
-
-static boolean_t
-arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio)
-{
- enum zio_compress compress = BP_GET_COMPRESS(zio->io_bp);
- boolean_t valid_cksum;
-
- ASSERT(!BP_IS_EMBEDDED(zio->io_bp));
- VERIFY3U(BP_GET_PSIZE(zio->io_bp), ==, HDR_GET_PSIZE(hdr));
-
- /*
- * We rely on the blkptr's checksum to determine if the block
- * is valid or not. When compressed arc is enabled, the l2arc
- * writes the block to the l2arc just as it appears in the pool.
- * This allows us to use the blkptr's checksum to validate the
- * data that we just read off of the l2arc without having to store
- * a separate checksum in the arc_buf_hdr_t. However, if compressed
- * arc is disabled, then the data written to the l2arc is always
- * uncompressed and won't match the block as it exists in the main
- * pool. When this is the case, we must first compress it if it is
- * compressed on the main pool before we can validate the checksum.
- */
- if (!HDR_COMPRESSION_ENABLED(hdr) && compress != ZIO_COMPRESS_OFF) {
- ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
- uint64_t lsize = HDR_GET_LSIZE(hdr);
- uint64_t csize;
-
- abd_t *cdata = abd_alloc_linear(HDR_GET_PSIZE(hdr), B_TRUE);
- csize = zio_compress_data(compress, zio->io_abd,
- abd_to_buf(cdata), lsize);
-
- ASSERT3U(csize, <=, HDR_GET_PSIZE(hdr));
- if (csize < HDR_GET_PSIZE(hdr)) {
- /*
- * Compressed blocks are always a multiple of the
- * smallest ashift in the pool. Ideally, we would
- * like to round up the csize to the next
- * spa_min_ashift but that value may have changed
- * since the block was last written. Instead,
- * we rely on the fact that the hdr's psize
- * was set to the psize of the block when it was
- * last written. We set the csize to that value
- * and zero out any part that should not contain
- * data.
- */
- abd_zero_off(cdata, csize, HDR_GET_PSIZE(hdr) - csize);
- csize = HDR_GET_PSIZE(hdr);
- }
- zio_push_transform(zio, cdata, csize, HDR_GET_PSIZE(hdr), NULL);
- }
-
- /*
- * Block pointers always store the checksum for the logical data.
- * If the block pointer has the gang bit set, then the checksum
- * it represents is for the reconstituted data and not for an
- * individual gang member. The zio pipeline, however, must be able to
- * determine the checksum of each of the gang constituents so it
- * treats the checksum comparison differently than what we need
- * for l2arc blocks. This prevents us from using the
- * zio_checksum_error() interface directly. Instead we must call the
- * zio_checksum_error_impl() so that we can ensure the checksum is
- * generated using the correct checksum algorithm and accounts for the
- * logical I/O size and not just a gang fragment.
- */
- valid_cksum = (zio_checksum_error_impl(zio->io_spa, zio->io_bp,
- BP_GET_CHECKSUM(zio->io_bp), zio->io_abd, zio->io_size,
- zio->io_offset, NULL) == 0);
- zio_pop_transforms(zio);
- return (valid_cksum);
-}
-
-/*
- * Given a buf full of data, if ZFS_DEBUG_MODIFY is enabled this computes a
- * checksum and attaches it to the buf's hdr so that we can ensure that the buf
- * isn't modified later on. If buf is compressed or there is already a checksum
- * on the hdr, this is a no-op (we only checksum uncompressed bufs).
- */
-static void
-arc_cksum_compute(arc_buf_t *buf)
-{
- arc_buf_hdr_t *hdr = buf->b_hdr;
-
- if (!(zfs_flags & ZFS_DEBUG_MODIFY))
- return;
-
- ASSERT(HDR_HAS_L1HDR(hdr));
-
- mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
- if (hdr->b_l1hdr.b_freeze_cksum != NULL) {
- ASSERT(arc_hdr_has_uncompressed_buf(hdr));
- mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
- return;
- } else if (ARC_BUF_COMPRESSED(buf)) {
- mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
- return;
- }
-
- ASSERT(!ARC_BUF_COMPRESSED(buf));
- hdr->b_l1hdr.b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t),
- KM_SLEEP);
- fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL,
- hdr->b_l1hdr.b_freeze_cksum);
- mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
-#ifdef illumos
- arc_buf_watch(buf);
-#endif
-}
-
-#ifdef illumos
-#ifndef _KERNEL
-typedef struct procctl {
- long cmd;
- prwatch_t prwatch;
-} procctl_t;
-#endif
-
-/* ARGSUSED */
-static void
-arc_buf_unwatch(arc_buf_t *buf)
-{
-#ifndef _KERNEL
- if (arc_watch) {
- int result;
- procctl_t ctl;
- ctl.cmd = PCWATCH;
- ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
- ctl.prwatch.pr_size = 0;
- ctl.prwatch.pr_wflags = 0;
- result = write(arc_procfd, &ctl, sizeof (ctl));
- ASSERT3U(result, ==, sizeof (ctl));
- }
-#endif
-}
-
-/* ARGSUSED */
-static void
-arc_buf_watch(arc_buf_t *buf)
-{
-#ifndef _KERNEL
- if (arc_watch) {
- int result;
- procctl_t ctl;
- ctl.cmd = PCWATCH;
- ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
- ctl.prwatch.pr_size = arc_buf_size(buf);
- ctl.prwatch.pr_wflags = WA_WRITE;
- result = write(arc_procfd, &ctl, sizeof (ctl));
- ASSERT3U(result, ==, sizeof (ctl));
- }
-#endif
-}
-#endif /* illumos */
-
-static arc_buf_contents_t
-arc_buf_type(arc_buf_hdr_t *hdr)
-{
- arc_buf_contents_t type;
- if (HDR_ISTYPE_METADATA(hdr)) {
- type = ARC_BUFC_METADATA;
- } else {
- type = ARC_BUFC_DATA;
- }
- VERIFY3U(hdr->b_type, ==, type);
- return (type);
-}
-
-boolean_t
-arc_is_metadata(arc_buf_t *buf)
-{
- return (HDR_ISTYPE_METADATA(buf->b_hdr) != 0);
-}
-
-static uint32_t
-arc_bufc_to_flags(arc_buf_contents_t type)
-{
- switch (type) {
- case ARC_BUFC_DATA:
- /* metadata field is 0 if buffer contains normal data */
- return (0);
- case ARC_BUFC_METADATA:
- return (ARC_FLAG_BUFC_METADATA);
- default:
- break;
- }
- panic("undefined ARC buffer type!");
- return ((uint32_t)-1);
-}
-
-void
-arc_buf_thaw(arc_buf_t *buf)
-{
- arc_buf_hdr_t *hdr = buf->b_hdr;
-
- ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
- ASSERT(!HDR_IO_IN_PROGRESS(hdr));
-
- arc_cksum_verify(buf);
-
- /*
- * Compressed buffers do not manipulate the b_freeze_cksum or
- * allocate b_thawed.
- */
- if (ARC_BUF_COMPRESSED(buf)) {
- ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL ||
- arc_hdr_has_uncompressed_buf(hdr));
- return;
- }
-
- ASSERT(HDR_HAS_L1HDR(hdr));
- arc_cksum_free(hdr);
-
- mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
-#ifdef ZFS_DEBUG
- if (zfs_flags & ZFS_DEBUG_MODIFY) {
- if (hdr->b_l1hdr.b_thawed != NULL)
- kmem_free(hdr->b_l1hdr.b_thawed, 1);
- hdr->b_l1hdr.b_thawed = kmem_alloc(1, KM_SLEEP);
- }
-#endif
-
- mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
-
-#ifdef illumos
- arc_buf_unwatch(buf);
-#endif
-}
-
-void
-arc_buf_freeze(arc_buf_t *buf)
-{
- arc_buf_hdr_t *hdr = buf->b_hdr;
- kmutex_t *hash_lock;
-
- if (!(zfs_flags & ZFS_DEBUG_MODIFY))
- return;
-
- if (ARC_BUF_COMPRESSED(buf)) {
- ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL ||
- arc_hdr_has_uncompressed_buf(hdr));
- return;
- }
-
- hash_lock = HDR_LOCK(hdr);
- mutex_enter(hash_lock);
-
- ASSERT(HDR_HAS_L1HDR(hdr));
- ASSERT(hdr->b_l1hdr.b_freeze_cksum != NULL ||
- hdr->b_l1hdr.b_state == arc_anon);
- arc_cksum_compute(buf);
- mutex_exit(hash_lock);
-}
-
-/*
- * The arc_buf_hdr_t's b_flags should never be modified directly. Instead,
- * the following functions should be used to ensure that the flags are
- * updated in a thread-safe way. When manipulating the flags either
- * the hash_lock must be held or the hdr must be undiscoverable. This
- * ensures that we're not racing with any other threads when updating
- * the flags.
- */
-static inline void
-arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags)
-{
- ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
- hdr->b_flags |= flags;
-}
-
-static inline void
-arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags)
-{
- ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
- hdr->b_flags &= ~flags;
-}
-
-/*
- * Setting the compression bits in the arc_buf_hdr_t's b_flags is
- * done in a special way since we have to clear and set bits
- * at the same time. Consumers that wish to set the compression bits
- * must use this function to ensure that the flags are updated in
- * thread-safe manner.
- */
-static void
-arc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp)
-{
- ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
-
- /*
- * Holes and embedded blocks will always have a psize of 0, so
- * we ignore the compression of the blkptr and set the
- * arc_buf_hdr_t's compression to ZIO_COMPRESS_OFF. Holes and
- * embedded blocks remain anonymous, so we don't want to
- * uncompress them; mark them as uncompressed.
- */
- if (!zfs_compressed_arc_enabled || HDR_GET_PSIZE(hdr) == 0) {
- arc_hdr_clear_flags(hdr, ARC_FLAG_COMPRESSED_ARC);
- HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF);
- ASSERT(!HDR_COMPRESSION_ENABLED(hdr));
- ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
- } else {
- arc_hdr_set_flags(hdr, ARC_FLAG_COMPRESSED_ARC);
- HDR_SET_COMPRESS(hdr, cmp);
- ASSERT3U(HDR_GET_COMPRESS(hdr), ==, cmp);
- ASSERT(HDR_COMPRESSION_ENABLED(hdr));
- }
-}
-
-/*
- * Looks for another buf on the same hdr which has the data decompressed, copies
- * from it, and returns true. If no such buf exists, returns false.
- */
-static boolean_t
-arc_buf_try_copy_decompressed_data(arc_buf_t *buf)
-{
- arc_buf_hdr_t *hdr = buf->b_hdr;
- boolean_t copied = B_FALSE;
-
- ASSERT(HDR_HAS_L1HDR(hdr));
- ASSERT3P(buf->b_data, !=, NULL);
- ASSERT(!ARC_BUF_COMPRESSED(buf));
-
- for (arc_buf_t *from = hdr->b_l1hdr.b_buf; from != NULL;
- from = from->b_next) {
- /* can't use our own data buffer */
- if (from == buf) {
- continue;
- }
-
- if (!ARC_BUF_COMPRESSED(from)) {
- bcopy(from->b_data, buf->b_data, arc_buf_size(buf));
- copied = B_TRUE;
- break;
- }
- }
-
- /*
- * There were no decompressed bufs, so there should not be a
- * checksum on the hdr either.
- */
- EQUIV(!copied, hdr->b_l1hdr.b_freeze_cksum == NULL);
-
- return (copied);
-}
-
-/*
- * Given a buf that has a data buffer attached to it, this function will
- * efficiently fill the buf with data of the specified compression setting from
- * the hdr and update the hdr's b_freeze_cksum if necessary. If the buf and hdr
- * are already sharing a data buf, no copy is performed.
- *
- * If the buf is marked as compressed but uncompressed data was requested, this
- * will allocate a new data buffer for the buf, remove that flag, and fill the
- * buf with uncompressed data. You can't request a compressed buf on a hdr
- * with uncompressed data; since we haven't yet added support for that, a
- * request for compressed data requires the buf to already be marked as
- * compressed and to have a correctly sized data buffer.
- */
-static int
-arc_buf_fill(arc_buf_t *buf, boolean_t compressed)
-{
- arc_buf_hdr_t *hdr = buf->b_hdr;
- boolean_t hdr_compressed = (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF);
- dmu_object_byteswap_t bswap = hdr->b_l1hdr.b_byteswap;
-
- ASSERT3P(buf->b_data, !=, NULL);
- IMPLY(compressed, hdr_compressed);
- IMPLY(compressed, ARC_BUF_COMPRESSED(buf));
-
- if (hdr_compressed == compressed) {
- if (!arc_buf_is_shared(buf)) {
- abd_copy_to_buf(buf->b_data, hdr->b_l1hdr.b_pabd,
- arc_buf_size(buf));
- }
- } else {
- ASSERT(hdr_compressed);
- ASSERT(!compressed);
- ASSERT3U(HDR_GET_LSIZE(hdr), !=, HDR_GET_PSIZE(hdr));
-
- /*
- * If the buf is sharing its data with the hdr, unlink it and
- * allocate a new data buffer for the buf.
- */
- if (arc_buf_is_shared(buf)) {
- ASSERT(ARC_BUF_COMPRESSED(buf));
-
- /* We need to give the buf its own b_data */
- buf->b_flags &= ~ARC_BUF_FLAG_SHARED;
- buf->b_data =
- arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf);
- arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
-
- /* Previously overhead was 0; just add new overhead */
- ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr));
- } else if (ARC_BUF_COMPRESSED(buf)) {
- /* We need to reallocate the buf's b_data */
- arc_free_data_buf(hdr, buf->b_data, HDR_GET_PSIZE(hdr),
- buf);
- buf->b_data =
- arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf);
-
- /* We increased the size of b_data; update overhead */
- ARCSTAT_INCR(arcstat_overhead_size,
- HDR_GET_LSIZE(hdr) - HDR_GET_PSIZE(hdr));
- }
-
- /*
- * Regardless of the buf's previous compression settings, it
- * should not be compressed at the end of this function.
- */
- buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED;
-
- /*
- * Try copying the data from another buf which already has a
- * decompressed version. If that's not possible, it's time to
- * bite the bullet and decompress the data from the hdr.
- */
- if (arc_buf_try_copy_decompressed_data(buf)) {
- /* Skip byteswapping and checksumming (already done) */
- ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, !=, NULL);
- return (0);
- } else {
- int error = zio_decompress_data(HDR_GET_COMPRESS(hdr),
- hdr->b_l1hdr.b_pabd, buf->b_data,
- HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr));
-
- /*
- * Absent hardware errors or software bugs, this should
- * be impossible, but log it anyway so we can debug it.
- */
- if (error != 0) {
- zfs_dbgmsg(
- "hdr %p, compress %d, psize %d, lsize %d",
- hdr, HDR_GET_COMPRESS(hdr),
- HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr));
- return (SET_ERROR(EIO));
- }
- }
- }
-
- /* Byteswap the buf's data if necessary */
- if (bswap != DMU_BSWAP_NUMFUNCS) {
- ASSERT(!HDR_SHARED_DATA(hdr));
- ASSERT3U(bswap, <, DMU_BSWAP_NUMFUNCS);
- dmu_ot_byteswap[bswap].ob_func(buf->b_data, HDR_GET_LSIZE(hdr));
- }
-
- /* Compute the hdr's checksum if necessary */
- arc_cksum_compute(buf);
-
- return (0);
-}
-
-int
-arc_decompress(arc_buf_t *buf)
-{
- return (arc_buf_fill(buf, B_FALSE));
-}
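-
-/*
- * Illustrative usage sketch (not part of the original source): a caller
- * holding a compressed buf can convert it in place; on success b_data
- * holds HDR_GET_LSIZE(hdr) bytes of uncompressed data:
- *
- *     if (arc_decompress(buf) != 0) {
- *             ...treat the buffer contents as unusable (EIO)...
- *     }
- */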
-
-/*
- * Return the size of the block, b_pabd, that is stored in the arc_buf_hdr_t.
- */
-static uint64_t
-arc_hdr_size(arc_buf_hdr_t *hdr)
-{
- uint64_t size;
-
- if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
- HDR_GET_PSIZE(hdr) > 0) {
- size = HDR_GET_PSIZE(hdr);
- } else {
- ASSERT3U(HDR_GET_LSIZE(hdr), !=, 0);
- size = HDR_GET_LSIZE(hdr);
- }
- return (size);
-}
-
-/*
- * Increment the amount of evictable space in the arc_state_t's refcount.
- * We account for the space used by the hdr and the arc buf individually
- * so that we can add and remove them from the refcount individually.
- */
-static void
-arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state)
-{
- arc_buf_contents_t type = arc_buf_type(hdr);
-
- ASSERT(HDR_HAS_L1HDR(hdr));
-
- if (GHOST_STATE(state)) {
- ASSERT0(hdr->b_l1hdr.b_bufcnt);
- ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
- ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
- (void) zfs_refcount_add_many(&state->arcs_esize[type],
- HDR_GET_LSIZE(hdr), hdr);
- return;
- }
-
- ASSERT(!GHOST_STATE(state));
- if (hdr->b_l1hdr.b_pabd != NULL) {
- (void) zfs_refcount_add_many(&state->arcs_esize[type],
- arc_hdr_size(hdr), hdr);
- }
- for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
- buf = buf->b_next) {
- if (arc_buf_is_shared(buf))
- continue;
- (void) zfs_refcount_add_many(&state->arcs_esize[type],
- arc_buf_size(buf), buf);
- }
-}
-
-/*
- * Decrement the amount of evictable space in the arc_state_t's refcount.
- * We account for the space used by the hdr and the arc buf individually
- * so that we can add and remove them from the refcount individually.
- */
-static void
-arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state)
-{
- arc_buf_contents_t type = arc_buf_type(hdr);
-
- ASSERT(HDR_HAS_L1HDR(hdr));
-
- if (GHOST_STATE(state)) {
- ASSERT0(hdr->b_l1hdr.b_bufcnt);
- ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
- ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
- (void) zfs_refcount_remove_many(&state->arcs_esize[type],
- HDR_GET_LSIZE(hdr), hdr);
- return;
- }
-
- ASSERT(!GHOST_STATE(state));
- if (hdr->b_l1hdr.b_pabd != NULL) {
- (void) zfs_refcount_remove_many(&state->arcs_esize[type],
- arc_hdr_size(hdr), hdr);
- }
- for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
- buf = buf->b_next) {
- if (arc_buf_is_shared(buf))
- continue;
- (void) zfs_refcount_remove_many(&state->arcs_esize[type],
- arc_buf_size(buf), buf);
- }
-}
-
-/*
- * Add a reference to this hdr indicating that someone is actively
- * referencing that memory. When the refcount transitions from 0 to 1,
- * we remove it from the respective arc_state_t list to indicate that
- * it is not evictable.
- */
-static void
-add_reference(arc_buf_hdr_t *hdr, void *tag)
-{
- ASSERT(HDR_HAS_L1HDR(hdr));
- if (!MUTEX_HELD(HDR_LOCK(hdr))) {
- ASSERT(hdr->b_l1hdr.b_state == arc_anon);
- ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
- ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
- }
-
- arc_state_t *state = hdr->b_l1hdr.b_state;
-
- if ((zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) &&
- (state != arc_anon)) {
- /* We don't use the L2-only state list. */
- if (state != arc_l2c_only) {
- multilist_remove(state->arcs_list[arc_buf_type(hdr)],
- hdr);
- arc_evictable_space_decrement(hdr, state);
- }
- /* remove the prefetch flag if we get a reference */
- arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH);
- }
-}
-
-/*
- * Remove a reference from this hdr. When the reference transitions from
- * 1 to 0 and we're not anonymous, then we add this hdr to the arc_state_t's
- * list making it eligible for eviction.
- */
-static int
-remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
-{
- int cnt;
- arc_state_t *state = hdr->b_l1hdr.b_state;
-
- ASSERT(HDR_HAS_L1HDR(hdr));
- ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
- ASSERT(!GHOST_STATE(state));
-
- /*
- * arc_l2c_only counts as a ghost state so we don't need to explicitly
- * check to prevent usage of the arc_l2c_only list.
- */
- if (((cnt = zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) &&
- (state != arc_anon)) {
- multilist_insert(state->arcs_list[arc_buf_type(hdr)], hdr);
- ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0);
- arc_evictable_space_increment(hdr, state);
- }
- return (cnt);
-}
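-
-/*
- * Illustrative sketch (not part of the original source): reference calls
- * are always paired, and the 0 <-> 1 transitions drive evictability:
- *
- *     add_reference(hdr, tag);
- *             0 -> 1: unlinked from its state list, not evictable
- *     ...use the hdr...
- *     (void) remove_reference(hdr, hash_lock, tag);
- *             1 -> 0: relinked on the state list, evictable again
- */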
-
-/*
- * Returns detailed information about a specific arc buffer. When the
- * state_index argument is set, the function will calculate the arc header's
- * list position within its arc state. Since this requires a linear traversal,
- * callers are strongly encouraged not to do so. However, it can be helpful
- * for targeted analysis, so the functionality is provided.
- */
-void
-arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index)
-{
- arc_buf_hdr_t *hdr = ab->b_hdr;
- l1arc_buf_hdr_t *l1hdr = NULL;
- l2arc_buf_hdr_t *l2hdr = NULL;
- arc_state_t *state = NULL;
-
- memset(abi, 0, sizeof (arc_buf_info_t));
-
- if (hdr == NULL)
- return;
-
- abi->abi_flags = hdr->b_flags;
-
- if (HDR_HAS_L1HDR(hdr)) {
- l1hdr = &hdr->b_l1hdr;
- state = l1hdr->b_state;
- }
- if (HDR_HAS_L2HDR(hdr))
- l2hdr = &hdr->b_l2hdr;
-
- if (l1hdr) {
- abi->abi_bufcnt = l1hdr->b_bufcnt;
- abi->abi_access = l1hdr->b_arc_access;
- abi->abi_mru_hits = l1hdr->b_mru_hits;
- abi->abi_mru_ghost_hits = l1hdr->b_mru_ghost_hits;
- abi->abi_mfu_hits = l1hdr->b_mfu_hits;
- abi->abi_mfu_ghost_hits = l1hdr->b_mfu_ghost_hits;
- abi->abi_holds = zfs_refcount_count(&l1hdr->b_refcnt);
- }
-
- if (l2hdr) {
- abi->abi_l2arc_dattr = l2hdr->b_daddr;
- abi->abi_l2arc_hits = l2hdr->b_hits;
- }
-
- abi->abi_state_type = state ? state->arcs_state : ARC_STATE_ANON;
- abi->abi_state_contents = arc_buf_type(hdr);
- abi->abi_size = arc_hdr_size(hdr);
-}
-
-/*
- * Move the supplied buffer to the indicated state. The hash lock
- * for the buffer must be held by the caller.
- */
-static void
-arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
- kmutex_t *hash_lock)
-{
- arc_state_t *old_state;
- int64_t refcnt;
- uint32_t bufcnt;
- boolean_t update_old, update_new;
- arc_buf_contents_t buftype = arc_buf_type(hdr);
-
- /*
- * We almost always have an L1 hdr here, since we call arc_hdr_realloc()
- * in arc_read() when bringing a buffer out of the L2ARC. However, the
- * L1 hdr doesn't always exist when we change state to arc_anon before
- * destroying a header, in which case reallocating to add the L1 hdr is
- * pointless.
- */
- if (HDR_HAS_L1HDR(hdr)) {
- old_state = hdr->b_l1hdr.b_state;
- refcnt = zfs_refcount_count(&hdr->b_l1hdr.b_refcnt);
- bufcnt = hdr->b_l1hdr.b_bufcnt;
- update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pabd != NULL);
- } else {
- old_state = arc_l2c_only;
- refcnt = 0;
- bufcnt = 0;
- update_old = B_FALSE;
- }
- update_new = update_old;
-
- ASSERT(MUTEX_HELD(hash_lock));
- ASSERT3P(new_state, !=, old_state);
- ASSERT(!GHOST_STATE(new_state) || bufcnt == 0);
- ASSERT(old_state != arc_anon || bufcnt <= 1);
-
- /*
- * If this buffer is evictable, transfer it from the
- * old state list to the new state list.
- */
- if (refcnt == 0) {
- if (old_state != arc_anon && old_state != arc_l2c_only) {
- ASSERT(HDR_HAS_L1HDR(hdr));
- multilist_remove(old_state->arcs_list[buftype], hdr);
-
- if (GHOST_STATE(old_state)) {
- ASSERT0(bufcnt);
- ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
- update_old = B_TRUE;
- }
- arc_evictable_space_decrement(hdr, old_state);
- }
- if (new_state != arc_anon && new_state != arc_l2c_only) {
-
- /*
- * An L1 header always exists here, since if we're
- * moving to some L1-cached state (i.e. not l2c_only or
- * anonymous), we realloc the header to add an L1hdr
- * beforehand.
- */
- ASSERT(HDR_HAS_L1HDR(hdr));
- multilist_insert(new_state->arcs_list[buftype], hdr);
-
- if (GHOST_STATE(new_state)) {
- ASSERT0(bufcnt);
- ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
- update_new = B_TRUE;
- }
- arc_evictable_space_increment(hdr, new_state);
- }
- }
-
- ASSERT(!HDR_EMPTY(hdr));
- if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr))
- buf_hash_remove(hdr);
-
- /* adjust state sizes (ignore arc_l2c_only) */
-
- if (update_new && new_state != arc_l2c_only) {
- ASSERT(HDR_HAS_L1HDR(hdr));
- if (GHOST_STATE(new_state)) {
- ASSERT0(bufcnt);
-
- /*
- * When moving a header to a ghost state, we first
- * remove all arc buffers. Thus, we'll have a
- * bufcnt of zero, and no arc buffer to use for
- * the reference. As a result, we use the arc
- * header pointer for the reference.
- */
- (void) zfs_refcount_add_many(&new_state->arcs_size,
- HDR_GET_LSIZE(hdr), hdr);
- ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
- } else {
- uint32_t buffers = 0;
-
- /*
- * Each individual buffer holds a unique reference,
- * thus we must remove each of these references one
- * at a time.
- */
- for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
- buf = buf->b_next) {
- ASSERT3U(bufcnt, !=, 0);
- buffers++;
-
- /*
- * When the arc_buf_t is sharing the data
- * block with the hdr, the owner of the
- * reference belongs to the hdr. Only
- * add to the refcount if the arc_buf_t is
- * not shared.
- */
- if (arc_buf_is_shared(buf))
- continue;
-
- (void) zfs_refcount_add_many(
- &new_state->arcs_size,
- arc_buf_size(buf), buf);
- }
- ASSERT3U(bufcnt, ==, buffers);
-
- if (hdr->b_l1hdr.b_pabd != NULL) {
- (void) zfs_refcount_add_many(
- &new_state->arcs_size,
- arc_hdr_size(hdr), hdr);
- } else {
- ASSERT(GHOST_STATE(old_state));
- }
- }
- }
-
- if (update_old && old_state != arc_l2c_only) {
- ASSERT(HDR_HAS_L1HDR(hdr));
- if (GHOST_STATE(old_state)) {
- ASSERT0(bufcnt);
- ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
-
- /*
- * When moving a header off of a ghost state,
- * the header will not contain any arc buffers.
- * We use the arc header pointer for the reference
- * which is exactly what we did when we put the
- * header on the ghost state.
- */
-
- (void) zfs_refcount_remove_many(&old_state->arcs_size,
- HDR_GET_LSIZE(hdr), hdr);
- } else {
- uint32_t buffers = 0;
-
- /*
- * Each individual buffer holds a unique reference,
- * thus we must remove each of these references one
- * at a time.
- */
- for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
- buf = buf->b_next) {
- ASSERT3U(bufcnt, !=, 0);
- buffers++;
-
- /*
- * When the arc_buf_t is sharing the data
- * block with the hdr, the owner of the
- * reference belongs to the hdr. Only
- * add to the refcount if the arc_buf_t is
- * not shared.
- */
- if (arc_buf_is_shared(buf))
- continue;
-
- (void) zfs_refcount_remove_many(
- &old_state->arcs_size, arc_buf_size(buf),
- buf);
- }
- ASSERT3U(bufcnt, ==, buffers);
- ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
- (void) zfs_refcount_remove_many(
- &old_state->arcs_size, arc_hdr_size(hdr), hdr);
- }
- }
-
- if (HDR_HAS_L1HDR(hdr))
- hdr->b_l1hdr.b_state = new_state;
-
- /*
- * L2 headers should never be on the L2 state list since they don't
- * have L1 headers allocated.
- */
- ASSERT(multilist_is_empty(arc_l2c_only->arcs_list[ARC_BUFC_DATA]) &&
- multilist_is_empty(arc_l2c_only->arcs_list[ARC_BUFC_METADATA]));
-}
-
-void
-arc_space_consume(uint64_t space, arc_space_type_t type)
-{
- ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
-
- switch (type) {
- case ARC_SPACE_DATA:
- aggsum_add(&astat_data_size, space);
- break;
- case ARC_SPACE_META:
- aggsum_add(&astat_metadata_size, space);
- break;
- case ARC_SPACE_BONUS:
- aggsum_add(&astat_bonus_size, space);
- break;
- case ARC_SPACE_DNODE:
- aggsum_add(&astat_dnode_size, space);
- break;
- case ARC_SPACE_DBUF:
- aggsum_add(&astat_dbuf_size, space);
- break;
- case ARC_SPACE_HDRS:
- aggsum_add(&astat_hdr_size, space);
- break;
- case ARC_SPACE_L2HDRS:
- aggsum_add(&astat_l2_hdr_size, space);
- break;
- }
-
- if (type != ARC_SPACE_DATA)
- aggsum_add(&arc_meta_used, space);
-
- aggsum_add(&arc_size, space);
-}
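-
-/*
- * Illustrative usage sketch (not part of the original source): every
- * consume must be balanced by a return of the same size and type, as the
- * hdr constructors and destructors above do:
- *
- *     arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS);
- *     ...
- *     arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS);
- */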
-
-void
-arc_space_return(uint64_t space, arc_space_type_t type)
-{
- ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
-
- switch (type) {
- case ARC_SPACE_DATA:
- aggsum_add(&astat_data_size, -space);
- break;
- case ARC_SPACE_META:
- aggsum_add(&astat_metadata_size, -space);
- break;
- case ARC_SPACE_BONUS:
- aggsum_add(&astat_bonus_size, -space);
- break;
- case ARC_SPACE_DNODE:
- aggsum_add(&astat_dnode_size, -space);
- break;
- case ARC_SPACE_DBUF:
- aggsum_add(&astat_dbuf_size, -space);
- break;
- case ARC_SPACE_HDRS:
- aggsum_add(&astat_hdr_size, -space);
- break;
- case ARC_SPACE_L2HDRS:
- aggsum_add(&astat_l2_hdr_size, -space);
- break;
- }
-
- if (type != ARC_SPACE_DATA) {
- ASSERT(aggsum_compare(&arc_meta_used, space) >= 0);
- /*
- * We use the upper bound here rather than the precise value
- * because the arc_meta_max value doesn't need to be
- * precise. It's only consumed by humans via arcstats.
- */
- if (arc_meta_max < aggsum_upper_bound(&arc_meta_used))
- arc_meta_max = aggsum_upper_bound(&arc_meta_used);
- aggsum_add(&arc_meta_used, -space);
- }
-
- ASSERT(aggsum_compare(&arc_size, space) >= 0);
- aggsum_add(&arc_size, -space);
-}
-
-/*
- * Given a hdr and a buf, returns whether that buf can share its b_data buffer
- * with the hdr's b_pabd.
- */
-static boolean_t
-arc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf)
-{
- /*
- * The criteria for sharing a hdr's data are:
- * 1. the hdr's compression matches the buf's compression
- * 2. the hdr doesn't need to be byteswapped
- * 3. the hdr isn't already being shared
- * 4. the buf is either compressed or it is the last buf in the hdr list
- *
- * Criterion #4 maintains the invariant that shared uncompressed
- * bufs must be the final buf in the hdr's b_buf list. Reading this, you
- * might ask, "if a compressed buf is allocated first, won't that be the
- * last thing in the list?", but in that case it's impossible to create
- * a shared uncompressed buf anyway (because the hdr must be compressed
- * to have the compressed buf). You might also think that #3 is
- * sufficient to make this guarantee, however it's possible
- * (specifically in the rare L2ARC write race mentioned in
- * arc_buf_alloc_impl()) there will be an existing uncompressed buf that
- * is sharable, but wasn't at the time of its allocation. Rather than
- * allow a new shared uncompressed buf to be created and then shuffle
- * the list around to make it the last element, this simply disallows
- * sharing if the new buf isn't the first to be added.
- */
- ASSERT3P(buf->b_hdr, ==, hdr);
- boolean_t hdr_compressed = HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF;
- boolean_t buf_compressed = ARC_BUF_COMPRESSED(buf) != 0;
- return (buf_compressed == hdr_compressed &&
- hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS &&
- !HDR_SHARED_DATA(hdr) &&
- (ARC_BUF_LAST(buf) || ARC_BUF_COMPRESSED(buf)));
-}
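-
-/*
- * Illustrative sketch (not part of the original source): assuming no
- * byteswap is pending and nothing is shared yet, the criteria above
- * resolve as:
- *
- *     compressed hdr, compressed buf              -> B_TRUE
- *     compressed hdr, uncompressed buf            -> B_FALSE (criterion 1)
- *     uncompressed hdr, last uncompressed buf     -> B_TRUE
- *     uncompressed hdr, earlier uncompressed buf  -> B_FALSE (criterion 4)
- */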
-
-/*
- * Allocate a buf for this hdr. If you care about the data that's in the hdr,
- * or if you want a compressed buffer, pass those flags in. Returns 0 if the
- * copy was made successfully, or an error code otherwise.
- */
-static int
-arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag, boolean_t compressed,
- boolean_t fill, arc_buf_t **ret)
-{
- arc_buf_t *buf;
-
- ASSERT(HDR_HAS_L1HDR(hdr));
- ASSERT3U(HDR_GET_LSIZE(hdr), >, 0);
- VERIFY(hdr->b_type == ARC_BUFC_DATA ||
- hdr->b_type == ARC_BUFC_METADATA);
- ASSERT3P(ret, !=, NULL);
- ASSERT3P(*ret, ==, NULL);
-
- buf = *ret = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
- buf->b_hdr = hdr;
- buf->b_data = NULL;
- buf->b_next = hdr->b_l1hdr.b_buf;
- buf->b_flags = 0;
-
- add_reference(hdr, tag);
-
- /*
- * We're about to change the hdr's b_flags. We must either
- * hold the hash_lock or be undiscoverable.
- */
- ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
-
- /*
- * Only honor requests for compressed bufs if the hdr is actually
- * compressed.
- */
- if (compressed && HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF)
- buf->b_flags |= ARC_BUF_FLAG_COMPRESSED;
-
- /*
- * If the hdr's data can be shared then we share the data buffer and
- * set the appropriate bit in the hdr's b_flags to indicate the hdr is
- * sharing its b_pabd with the arc_buf_t. Otherwise, we allocate a new
- * buffer to store the buf's data.
- *
- * There are two additional restrictions here because we're sharing
- * hdr -> buf instead of the usual buf -> hdr. First, the hdr can't be
- * actively involved in an L2ARC write, because if this buf is used by
- * an arc_write() then the hdr's data buffer will be released when the
- * write completes, even though the L2ARC write might still be using it.
- * Second, the hdr's ABD must be linear so that the buf's user doesn't
- * need to be ABD-aware.
- */
- boolean_t can_share = arc_can_share(hdr, buf) && !HDR_L2_WRITING(hdr) &&
- abd_is_linear(hdr->b_l1hdr.b_pabd);
-
- /* Set up b_data and sharing */
- if (can_share) {
- buf->b_data = abd_to_buf(hdr->b_l1hdr.b_pabd);
- buf->b_flags |= ARC_BUF_FLAG_SHARED;
- arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
- } else {
- buf->b_data =
- arc_get_data_buf(hdr, arc_buf_size(buf), buf);
- ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf));
- }
- VERIFY3P(buf->b_data, !=, NULL);
-
- hdr->b_l1hdr.b_buf = buf;
- hdr->b_l1hdr.b_bufcnt += 1;
-
- /*
- * If the user wants the data from the hdr, we need to either copy or
- * decompress the data.
- */
- if (fill) {
- return (arc_buf_fill(buf, ARC_BUF_COMPRESSED(buf) != 0));
- }
-
- return (0);
-}
-
-static char *arc_onloan_tag = "onloan";
-
-static inline void
-arc_loaned_bytes_update(int64_t delta)
-{
- atomic_add_64(&arc_loaned_bytes, delta);
-
- /* assert that it did not wrap around */
- ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0);
-}
-
-/*
- * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
- * flight data by arc_tempreserve_space() until they are "returned". Loaned
- * buffers must be returned to the arc before they can be used by the DMU or
- * freed.
- */
-arc_buf_t *
-arc_loan_buf(spa_t *spa, boolean_t is_metadata, int size)
-{
- arc_buf_t *buf = arc_alloc_buf(spa, arc_onloan_tag,
- is_metadata ? ARC_BUFC_METADATA : ARC_BUFC_DATA, size);
-
- arc_loaned_bytes_update(arc_buf_size(buf));
-
- return (buf);
-}
-
-arc_buf_t *
-arc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize,
- enum zio_compress compression_type)
-{
- arc_buf_t *buf = arc_alloc_compressed_buf(spa, arc_onloan_tag,
- psize, lsize, compression_type);
-
- arc_loaned_bytes_update(arc_buf_size(buf));
-
- return (buf);
-}
-
-/*
- * Return a loaned arc buffer to the arc.
- */
-void
-arc_return_buf(arc_buf_t *buf, void *tag)
-{
- arc_buf_hdr_t *hdr = buf->b_hdr;
-
- ASSERT3P(buf->b_data, !=, NULL);
- ASSERT(HDR_HAS_L1HDR(hdr));
- (void) zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, tag);
- (void) zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
-
- arc_loaned_bytes_update(-arc_buf_size(buf));
-}
-
-/* Detach an arc_buf from a dbuf (tag) */
-void
-arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
-{
- arc_buf_hdr_t *hdr = buf->b_hdr;
-
- ASSERT3P(buf->b_data, !=, NULL);
- ASSERT(HDR_HAS_L1HDR(hdr));
- (void) zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
- (void) zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag);
-
- arc_loaned_bytes_update(arc_buf_size(buf));
-}
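-
-/*
- * Illustrative usage sketch (not part of the original source): the loan
- * APIs bracket a buffer's time outside the arc's accounting; "tag" here
- * is whatever reference tag the consumer uses:
- *
- *     arc_buf_t *buf = arc_loan_buf(spa, B_FALSE, size);
- *     ...fill buf->b_data...
- *     arc_return_buf(buf, tag);
- */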
-
-static void
-l2arc_free_abd_on_write(abd_t *abd, size_t size, arc_buf_contents_t type)
-{
- l2arc_data_free_t *df = kmem_alloc(sizeof (*df), KM_SLEEP);
-
- df->l2df_abd = abd;
- df->l2df_size = size;
- df->l2df_type = type;
- mutex_enter(&l2arc_free_on_write_mtx);
- list_insert_head(l2arc_free_on_write, df);
- mutex_exit(&l2arc_free_on_write_mtx);
-}
-
-static void
-arc_hdr_free_on_write(arc_buf_hdr_t *hdr)
-{
- arc_state_t *state = hdr->b_l1hdr.b_state;
- arc_buf_contents_t type = arc_buf_type(hdr);
- uint64_t size = arc_hdr_size(hdr);
-
- /* protected by hash lock, if in the hash table */
- if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
- ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
- ASSERT(state != arc_anon && state != arc_l2c_only);
-
- (void) zfs_refcount_remove_many(&state->arcs_esize[type],
- size, hdr);
- }
- (void) zfs_refcount_remove_many(&state->arcs_size, size, hdr);
- if (type == ARC_BUFC_METADATA) {
- arc_space_return(size, ARC_SPACE_META);
- } else {
- ASSERT(type == ARC_BUFC_DATA);
- arc_space_return(size, ARC_SPACE_DATA);
- }
-
- l2arc_free_abd_on_write(hdr->b_l1hdr.b_pabd, size, type);
-}
-
-/*
- * Share the arc_buf_t's data with the hdr. Whenever we are sharing the
- * data buffer, we transfer the refcount ownership to the hdr and update
- * the appropriate kstats.
- */
-static void
-arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
-{
- arc_state_t *state = hdr->b_l1hdr.b_state;
-
- ASSERT(arc_can_share(hdr, buf));
- ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
- ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
-
- /*
- * Start sharing the data buffer. We transfer the
- * refcount ownership to the hdr since it always owns
- * the refcount whenever an arc_buf_t is shared.
- */
- zfs_refcount_transfer_ownership(&state->arcs_size, buf, hdr);
- hdr->b_l1hdr.b_pabd = abd_get_from_buf(buf->b_data, arc_buf_size(buf));
- abd_take_ownership_of_buf(hdr->b_l1hdr.b_pabd,
- HDR_ISTYPE_METADATA(hdr));
- arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
- buf->b_flags |= ARC_BUF_FLAG_SHARED;
-
- /*
- * Since we've transferred ownership to the hdr we need
- * to increment its compressed and uncompressed kstats and
- * decrement the overhead size.
- */
- ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr));
- ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr));
- ARCSTAT_INCR(arcstat_overhead_size, -arc_buf_size(buf));
-}
-
-static void
-arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
-{
- arc_state_t *state = hdr->b_l1hdr.b_state;
-
- ASSERT(arc_buf_is_shared(buf));
- ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
- ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
-
- /*
- * We are no longer sharing this buffer so we need
- * to transfer its ownership to the rightful owner.
- */
- zfs_refcount_transfer_ownership(&state->arcs_size, hdr, buf);
- arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
- abd_release_ownership_of_buf(hdr->b_l1hdr.b_pabd);
- abd_put(hdr->b_l1hdr.b_pabd);
- hdr->b_l1hdr.b_pabd = NULL;
- buf->b_flags &= ~ARC_BUF_FLAG_SHARED;
-
- /*
- * Since the buffer is no longer shared between
- * the arc buf and the hdr, count it as overhead.
- */
- ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr));
- ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr));
- ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf));
-}
-
-/*
- * Remove an arc_buf_t from the hdr's buf list and return the last
- * arc_buf_t on the list. If no buffers remain on the list then return
- * NULL.
- */
-static arc_buf_t *
-arc_buf_remove(arc_buf_hdr_t *hdr, arc_buf_t *buf)
-{
- ASSERT(HDR_HAS_L1HDR(hdr));
- ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
-
- arc_buf_t **bufp = &hdr->b_l1hdr.b_buf;
- arc_buf_t *lastbuf = NULL;
-
- /*
- * Remove the buf from the hdr list and locate the last
- * remaining buffer on the list.
- */
- while (*bufp != NULL) {
- if (*bufp == buf)
- *bufp = buf->b_next;
-
- /*
- * If we've removed a buffer in the middle of
- * the list then update the lastbuf and update
- * bufp.
- */
- if (*bufp != NULL) {
- lastbuf = *bufp;
- bufp = &(*bufp)->b_next;
- }
- }
- buf->b_next = NULL;
- ASSERT3P(lastbuf, !=, buf);
- IMPLY(hdr->b_l1hdr.b_bufcnt > 0, lastbuf != NULL);
- IMPLY(hdr->b_l1hdr.b_bufcnt > 0, hdr->b_l1hdr.b_buf != NULL);
- IMPLY(lastbuf != NULL, ARC_BUF_LAST(lastbuf));
-
- return (lastbuf);
-}
-
-/*
- * Free up buf->b_data and pull the arc_buf_t off of the arc_buf_hdr_t's
- * list and free it.
- */
-static void
-arc_buf_destroy_impl(arc_buf_t *buf)
-{
- arc_buf_hdr_t *hdr = buf->b_hdr;
-
- /*
- * Free up the data associated with the buf but only if we're not
- * sharing this with the hdr. If we are sharing it with the hdr, the
- * hdr is responsible for doing the free.
- */
- if (buf->b_data != NULL) {
- /*
- * We're about to change the hdr's b_flags. We must either
- * hold the hash_lock or be undiscoverable.
- */
- ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
-
- arc_cksum_verify(buf);
-#ifdef illumos
- arc_buf_unwatch(buf);
-#endif
-
- if (arc_buf_is_shared(buf)) {
- arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
- } else {
- uint64_t size = arc_buf_size(buf);
- arc_free_data_buf(hdr, buf->b_data, size, buf);
- ARCSTAT_INCR(arcstat_overhead_size, -size);
- }
- buf->b_data = NULL;
-
- ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
- hdr->b_l1hdr.b_bufcnt -= 1;
- }
-
- arc_buf_t *lastbuf = arc_buf_remove(hdr, buf);
-
- if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) {
- /*
- * If the current arc_buf_t is sharing its data buffer with the
- * hdr, then reassign the hdr's b_pabd to share it with the new
- * buffer at the end of the list. The shared buffer is always
- * the last one on the hdr's buffer list.
- *
- * There is an equivalent case for compressed bufs, but since
- * they aren't guaranteed to be the last buf in the list and
		 * that is an exceedingly rare case, we just allow that space to be
- * wasted temporarily.
- */
- if (lastbuf != NULL) {
- /* Only one buf can be shared at once */
- VERIFY(!arc_buf_is_shared(lastbuf));
- /* hdr is uncompressed so can't have compressed buf */
- VERIFY(!ARC_BUF_COMPRESSED(lastbuf));
-
- ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
- arc_hdr_free_pabd(hdr);
-
- /*
			 * We must set up a new shared block between the
- * last buffer and the hdr. The data would have
- * been allocated by the arc buf so we need to transfer
- * ownership to the hdr since it's now being shared.
- */
- arc_share_buf(hdr, lastbuf);
- }
- } else if (HDR_SHARED_DATA(hdr)) {
- /*
- * Uncompressed shared buffers are always at the end
- * of the list. Compressed buffers don't have the
- * same requirements. This makes it hard to
- * simply assert that the lastbuf is shared so
- * we rely on the hdr's compression flags to determine
- * if we have a compressed, shared buffer.
- */
- ASSERT3P(lastbuf, !=, NULL);
- ASSERT(arc_buf_is_shared(lastbuf) ||
- HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF);
- }
-
- /*
- * Free the checksum if we're removing the last uncompressed buf from
- * this hdr.
- */
- if (!arc_hdr_has_uncompressed_buf(hdr)) {
- arc_cksum_free(hdr);
- }
-
- /* clean up the buf */
- buf->b_hdr = NULL;
- kmem_cache_free(buf_cache, buf);
-}
-
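-/*
- * Allocate the hdr's backing abd (b_pabd) and charge the compressed and
- * uncompressed size kstats. When 'do_adapt' is set, the allocation also
- * feeds arc_adapt() to drive target-size adaptation.
- */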
-static void
-arc_hdr_alloc_pabd(arc_buf_hdr_t *hdr, boolean_t do_adapt)
-{
- ASSERT3U(HDR_GET_LSIZE(hdr), >, 0);
- ASSERT(HDR_HAS_L1HDR(hdr));
- ASSERT(!HDR_SHARED_DATA(hdr));
-
- ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
	hdr->b_l1hdr.b_pabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr,
	    do_adapt);
- hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
- ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
-
- ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr));
- ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr));
-}
-
-static void
-arc_hdr_free_pabd(arc_buf_hdr_t *hdr)
-{
- ASSERT(HDR_HAS_L1HDR(hdr));
- ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
-
- /*
- * If the hdr is currently being written to the l2arc then
- * we defer freeing the data by adding it to the l2arc_free_on_write
- * list. The l2arc will free the data once it's finished
- * writing it to the l2arc device.
- */
- if (HDR_L2_WRITING(hdr)) {
- arc_hdr_free_on_write(hdr);
- ARCSTAT_BUMP(arcstat_l2_free_on_write);
- } else {
- arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
- arc_hdr_size(hdr), hdr);
- }
- hdr->b_l1hdr.b_pabd = NULL;
- hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
-
- ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr));
- ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr));
-}
-
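-/*
- * Allocate a full header (one with an L1 portion) for a block with the
- * given physical and logical sizes. The header starts out anonymous,
- * unreferenced, and with its b_pabd allocated.
- */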
-static arc_buf_hdr_t *
-arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize,
- enum zio_compress compression_type, arc_buf_contents_t type)
-{
- arc_buf_hdr_t *hdr;
-
- VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA);
-
- hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
- ASSERT(HDR_EMPTY(hdr));
- ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
- ASSERT3P(hdr->b_l1hdr.b_thawed, ==, NULL);
- HDR_SET_PSIZE(hdr, psize);
- HDR_SET_LSIZE(hdr, lsize);
- hdr->b_spa = spa;
- hdr->b_type = type;
- hdr->b_flags = 0;
- arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L1HDR);
- arc_hdr_set_compress(hdr, compression_type);
-
- hdr->b_l1hdr.b_state = arc_anon;
- hdr->b_l1hdr.b_arc_access = 0;
- hdr->b_l1hdr.b_bufcnt = 0;
- hdr->b_l1hdr.b_buf = NULL;
-
- /*
- * Allocate the hdr's buffer. This will contain either
	 * the compressed or uncompressed data, depending on the block
	 * it references and on whether compressed ARC is enabled.
- */
- arc_hdr_alloc_pabd(hdr, B_TRUE);
- ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
-
- return (hdr);
-}
-
-/*
- * Transition between the two allocation states for the arc_buf_hdr struct.
- * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without
- * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller
- * version is used when a cache buffer is only in the L2ARC in order to reduce
- * memory usage.
- */
-static arc_buf_hdr_t *
-arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
-{
- ASSERT(HDR_HAS_L2HDR(hdr));
-
- arc_buf_hdr_t *nhdr;
- l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
-
- ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) ||
- (old == hdr_l2only_cache && new == hdr_full_cache));
-
- nhdr = kmem_cache_alloc(new, KM_PUSHPAGE);
-
- ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
- buf_hash_remove(hdr);
-
- bcopy(hdr, nhdr, HDR_L2ONLY_SIZE);
-
- if (new == hdr_full_cache) {
- arc_hdr_set_flags(nhdr, ARC_FLAG_HAS_L1HDR);
- /*
- * arc_access and arc_change_state need to be aware that a
- * header has just come out of L2ARC, so we set its state to
- * l2c_only even though it's about to change.
- */
- nhdr->b_l1hdr.b_state = arc_l2c_only;
-
		/* Verify previous threads set b_pabd to NULL before freeing */
- ASSERT3P(nhdr->b_l1hdr.b_pabd, ==, NULL);
- } else {
- ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
- ASSERT0(hdr->b_l1hdr.b_bufcnt);
- ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
-
- /*
		 * If we've reached here, we must have been called from
- * arc_evict_hdr(), as such we should have already been
- * removed from any ghost list we were previously on
- * (which protects us from racing with arc_evict_state),
- * thus no locking is needed during this check.
- */
- ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
-
- /*
- * A buffer must not be moved into the arc_l2c_only
- * state if it's not finished being written out to the
- * l2arc device. Otherwise, the b_l1hdr.b_pabd field
		 * might be accessed even though it has been removed.
- */
- VERIFY(!HDR_L2_WRITING(hdr));
- VERIFY3P(hdr->b_l1hdr.b_pabd, ==, NULL);
-
-#ifdef ZFS_DEBUG
- if (hdr->b_l1hdr.b_thawed != NULL) {
- kmem_free(hdr->b_l1hdr.b_thawed, 1);
- hdr->b_l1hdr.b_thawed = NULL;
- }
-#endif
-
- arc_hdr_clear_flags(nhdr, ARC_FLAG_HAS_L1HDR);
- }
- /*
- * The header has been reallocated so we need to re-insert it into any
- * lists it was on.
- */
- (void) buf_hash_insert(nhdr, NULL);
-
- ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node));
-
- mutex_enter(&dev->l2ad_mtx);
-
- /*
- * We must place the realloc'ed header back into the list at
- * the same spot. Otherwise, if it's placed earlier in the list,
- * l2arc_write_buffers() could find it during the function's
- * write phase, and try to write it out to the l2arc.
- */
- list_insert_after(&dev->l2ad_buflist, hdr, nhdr);
- list_remove(&dev->l2ad_buflist, hdr);
-
- mutex_exit(&dev->l2ad_mtx);
-
- /*
- * Since we're using the pointer address as the tag when
- * incrementing and decrementing the l2ad_alloc refcount, we
- * must remove the old pointer (that we're about to destroy) and
- * add the new pointer to the refcount. Otherwise we'd remove
- * the wrong pointer address when calling arc_hdr_destroy() later.
- */
-
- (void) zfs_refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr),
- hdr);
- (void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(nhdr),
- nhdr);
-
- buf_discard_identity(hdr);
- kmem_cache_free(old, hdr);
-
- return (nhdr);
-}
-
-/*
- * Allocate a new arc_buf_hdr_t and arc_buf_t and return the buf to the caller.
- * The buf is returned thawed since we expect the consumer to modify it.
- */
-arc_buf_t *
-arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, int32_t size)
-{
- arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size,
- ZIO_COMPRESS_OFF, type);
- ASSERT(!MUTEX_HELD(HDR_LOCK(hdr)));
-
- arc_buf_t *buf = NULL;
- VERIFY0(arc_buf_alloc_impl(hdr, tag, B_FALSE, B_FALSE, &buf));
- arc_buf_thaw(buf);
-
- return (buf);
-}
-
-/*
- * Allocate a compressed buf in the same manner as arc_alloc_buf. Don't use this
- * for bufs containing metadata.
- */
-arc_buf_t *
-arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize,
- enum zio_compress compression_type)
-{
- ASSERT3U(lsize, >, 0);
- ASSERT3U(lsize, >=, psize);
- ASSERT(compression_type > ZIO_COMPRESS_OFF);
- ASSERT(compression_type < ZIO_COMPRESS_FUNCTIONS);
-
- arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize,
- compression_type, ARC_BUFC_DATA);
- ASSERT(!MUTEX_HELD(HDR_LOCK(hdr)));
-
- arc_buf_t *buf = NULL;
- VERIFY0(arc_buf_alloc_impl(hdr, tag, B_TRUE, B_FALSE, &buf));
- arc_buf_thaw(buf);
- ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
-
- if (!arc_buf_is_shared(buf)) {
- /*
- * To ensure that the hdr has the correct data in it if we call
- * arc_decompress() on this buf before it's been written to
- * disk, it's easiest if we just set up sharing between the
- * buf and the hdr.
- */
- ASSERT(!abd_is_linear(hdr->b_l1hdr.b_pabd));
- arc_hdr_free_pabd(hdr);
- arc_share_buf(hdr, buf);
- }
-
- return (buf);
-}
-
-static void
-arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr)
-{
- l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;
- l2arc_dev_t *dev = l2hdr->b_dev;
- uint64_t psize = arc_hdr_size(hdr);
-
- ASSERT(MUTEX_HELD(&dev->l2ad_mtx));
- ASSERT(HDR_HAS_L2HDR(hdr));
-
- list_remove(&dev->l2ad_buflist, hdr);
-
- ARCSTAT_INCR(arcstat_l2_psize, -psize);
- ARCSTAT_INCR(arcstat_l2_lsize, -HDR_GET_LSIZE(hdr));
-
- vdev_space_update(dev->l2ad_vdev, -psize, 0, 0);
-
- (void) zfs_refcount_remove_many(&dev->l2ad_alloc, psize, hdr);
- arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR);
-}
-
-static void
-arc_hdr_destroy(arc_buf_hdr_t *hdr)
-{
- if (HDR_HAS_L1HDR(hdr)) {
- ASSERT(hdr->b_l1hdr.b_buf == NULL ||
- hdr->b_l1hdr.b_bufcnt > 0);
- ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
- ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
- }
- ASSERT(!HDR_IO_IN_PROGRESS(hdr));
- ASSERT(!HDR_IN_HASH_TABLE(hdr));
-
- if (!HDR_EMPTY(hdr))
- buf_discard_identity(hdr);
-
- if (HDR_HAS_L2HDR(hdr)) {
- l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
- boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx);
-
- if (!buflist_held)
- mutex_enter(&dev->l2ad_mtx);
-
- /*
- * Even though we checked this conditional above, we
- * need to check this again now that we have the
- * l2ad_mtx. This is because we could be racing with
- * another thread calling l2arc_evict() which might have
- * destroyed this header's L2 portion as we were waiting
- * to acquire the l2ad_mtx. If that happens, we don't
- * want to re-destroy the header's L2 portion.
- */
- if (HDR_HAS_L2HDR(hdr)) {
- l2arc_trim(hdr);
- arc_hdr_l2hdr_destroy(hdr);
- }
-
- if (!buflist_held)
- mutex_exit(&dev->l2ad_mtx);
- }
-
- if (HDR_HAS_L1HDR(hdr)) {
- arc_cksum_free(hdr);
-
- while (hdr->b_l1hdr.b_buf != NULL)
- arc_buf_destroy_impl(hdr->b_l1hdr.b_buf);
-
-#ifdef ZFS_DEBUG
- if (hdr->b_l1hdr.b_thawed != NULL) {
- kmem_free(hdr->b_l1hdr.b_thawed, 1);
- hdr->b_l1hdr.b_thawed = NULL;
- }
-#endif
-
- if (hdr->b_l1hdr.b_pabd != NULL) {
- arc_hdr_free_pabd(hdr);
- }
- }
-
- ASSERT3P(hdr->b_hash_next, ==, NULL);
- if (HDR_HAS_L1HDR(hdr)) {
- ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
- ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
- kmem_cache_free(hdr_full_cache, hdr);
- } else {
- kmem_cache_free(hdr_l2only_cache, hdr);
- }
-}
-
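-/*
- * Free the given buffer on behalf of 'tag': drop tag's reference on the
- * hdr and destroy the buf itself. An anonymous hdr (this being its only
- * buf) is destroyed outright; a hashed hdr stays in the cache.
- */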
-void
-arc_buf_destroy(arc_buf_t *buf, void* tag)
-{
- arc_buf_hdr_t *hdr = buf->b_hdr;
- kmutex_t *hash_lock = HDR_LOCK(hdr);
-
- if (hdr->b_l1hdr.b_state == arc_anon) {
- ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
- ASSERT(!HDR_IO_IN_PROGRESS(hdr));
- VERIFY0(remove_reference(hdr, NULL, tag));
- arc_hdr_destroy(hdr);
- return;
- }
-
- mutex_enter(hash_lock);
- ASSERT3P(hdr, ==, buf->b_hdr);
- ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
- ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
- ASSERT3P(hdr->b_l1hdr.b_state, !=, arc_anon);
- ASSERT3P(buf->b_data, !=, NULL);
-
- (void) remove_reference(hdr, hash_lock, tag);
- arc_buf_destroy_impl(buf);
- mutex_exit(hash_lock);
-}
-
-/*
- * Evict the arc_buf_hdr that is provided as a parameter. The resultant
- * state of the header is dependent on its state prior to entering this
- * function. The following transitions are possible:
- *
- * - arc_mru -> arc_mru_ghost
- * - arc_mfu -> arc_mfu_ghost
- * - arc_mru_ghost -> arc_l2c_only
- * - arc_mru_ghost -> deleted
- * - arc_mfu_ghost -> arc_l2c_only
- * - arc_mfu_ghost -> deleted
- */
-static int64_t
-arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
-{
- arc_state_t *evicted_state, *state;
- int64_t bytes_evicted = 0;
- int min_lifetime = HDR_PRESCIENT_PREFETCH(hdr) ?
- zfs_arc_min_prescient_prefetch_ms : zfs_arc_min_prefetch_ms;
-
- ASSERT(MUTEX_HELD(hash_lock));
- ASSERT(HDR_HAS_L1HDR(hdr));
-
- state = hdr->b_l1hdr.b_state;
- if (GHOST_STATE(state)) {
- ASSERT(!HDR_IO_IN_PROGRESS(hdr));
- ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
-
- /*
- * l2arc_write_buffers() relies on a header's L1 portion
		 * (i.e. its b_pabd field) during its write phase.
		 * Thus, we cannot push a header onto the arc_l2c_only
		 * state (removing its L1 piece) until the header is
- * done being written to the l2arc.
- */
- if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) {
- ARCSTAT_BUMP(arcstat_evict_l2_skip);
- return (bytes_evicted);
- }
-
- ARCSTAT_BUMP(arcstat_deleted);
- bytes_evicted += HDR_GET_LSIZE(hdr);
-
- DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr);
-
- ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
- if (HDR_HAS_L2HDR(hdr)) {
- /*
- * This buffer is cached on the 2nd Level ARC;
- * don't destroy the header.
- */
- arc_change_state(arc_l2c_only, hdr, hash_lock);
- /*
- * dropping from L1+L2 cached to L2-only,
- * realloc to remove the L1 header.
- */
- hdr = arc_hdr_realloc(hdr, hdr_full_cache,
- hdr_l2only_cache);
- } else {
- arc_change_state(arc_anon, hdr, hash_lock);
- arc_hdr_destroy(hdr);
- }
- return (bytes_evicted);
- }
-
- ASSERT(state == arc_mru || state == arc_mfu);
- evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
-
- /* prefetch buffers have a minimum lifespan */
- if (HDR_IO_IN_PROGRESS(hdr) ||
- ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
- ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < min_lifetime * hz)) {
- ARCSTAT_BUMP(arcstat_evict_skip);
- return (bytes_evicted);
- }
-
- ASSERT0(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt));
- while (hdr->b_l1hdr.b_buf) {
- arc_buf_t *buf = hdr->b_l1hdr.b_buf;
- if (!mutex_tryenter(&buf->b_evict_lock)) {
- ARCSTAT_BUMP(arcstat_mutex_miss);
- break;
- }
- if (buf->b_data != NULL)
- bytes_evicted += HDR_GET_LSIZE(hdr);
- mutex_exit(&buf->b_evict_lock);
- arc_buf_destroy_impl(buf);
- }
-
- if (HDR_HAS_L2HDR(hdr)) {
- ARCSTAT_INCR(arcstat_evict_l2_cached, HDR_GET_LSIZE(hdr));
- } else {
- if (l2arc_write_eligible(hdr->b_spa, hdr)) {
- ARCSTAT_INCR(arcstat_evict_l2_eligible,
- HDR_GET_LSIZE(hdr));
- } else {
- ARCSTAT_INCR(arcstat_evict_l2_ineligible,
- HDR_GET_LSIZE(hdr));
- }
- }
-
- if (hdr->b_l1hdr.b_bufcnt == 0) {
- arc_cksum_free(hdr);
-
- bytes_evicted += arc_hdr_size(hdr);
-
- /*
- * If this hdr is being evicted and has a compressed
- * buffer then we discard it here before we change states.
- * This ensures that the accounting is updated correctly
- * in arc_free_data_impl().
- */
- arc_hdr_free_pabd(hdr);
-
- arc_change_state(evicted_state, hdr, hash_lock);
- ASSERT(HDR_IN_HASH_TABLE(hdr));
- arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
- DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr);
- }
-
- return (bytes_evicted);
-}
-
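-/*
- * Evict up to 'bytes' bytes from sublist 'idx' of the multilist 'ml',
- * walking backwards from 'marker'. At most zfs_arc_evict_batch_limit
- * headers are evicted per call. Returns the number of bytes evicted.
- */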
-static uint64_t
-arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
- uint64_t spa, int64_t bytes)
-{
- multilist_sublist_t *mls;
- uint64_t bytes_evicted = 0;
- arc_buf_hdr_t *hdr;
- kmutex_t *hash_lock;
- int evict_count = 0;
-
- ASSERT3P(marker, !=, NULL);
- IMPLY(bytes < 0, bytes == ARC_EVICT_ALL);
-
- mls = multilist_sublist_lock(ml, idx);
-
- for (hdr = multilist_sublist_prev(mls, marker); hdr != NULL;
- hdr = multilist_sublist_prev(mls, marker)) {
- if ((bytes != ARC_EVICT_ALL && bytes_evicted >= bytes) ||
- (evict_count >= zfs_arc_evict_batch_limit))
- break;
-
- /*
- * To keep our iteration location, move the marker
- * forward. Since we're not holding hdr's hash lock, we
- * must be very careful and not remove 'hdr' from the
- * sublist. Otherwise, other consumers might mistake the
- * 'hdr' as not being on a sublist when they call the
- * multilist_link_active() function (they all rely on
- * the hash lock protecting concurrent insertions and
- * removals). multilist_sublist_move_forward() was
- * specifically implemented to ensure this is the case
- * (only 'marker' will be removed and re-inserted).
- */
- multilist_sublist_move_forward(mls, marker);
-
- /*
- * The only case where the b_spa field should ever be
		 * zero is for the marker headers inserted by
- * arc_evict_state(). It's possible for multiple threads
- * to be calling arc_evict_state() concurrently (e.g.
- * dsl_pool_close() and zio_inject_fault()), so we must
- * skip any markers we see from these other threads.
- */
- if (hdr->b_spa == 0)
- continue;
-
- /* we're only interested in evicting buffers of a certain spa */
- if (spa != 0 && hdr->b_spa != spa) {
- ARCSTAT_BUMP(arcstat_evict_skip);
- continue;
- }
-
- hash_lock = HDR_LOCK(hdr);
-
- /*
- * We aren't calling this function from any code path
- * that would already be holding a hash lock, so we're
- * asserting on this assumption to be defensive in case
- * this ever changes. Without this check, it would be
- * possible to incorrectly increment arcstat_mutex_miss
- * below (e.g. if the code changed such that we called
- * this function with a hash lock held).
- */
- ASSERT(!MUTEX_HELD(hash_lock));
-
- if (mutex_tryenter(hash_lock)) {
- uint64_t evicted = arc_evict_hdr(hdr, hash_lock);
- mutex_exit(hash_lock);
-
- bytes_evicted += evicted;
-
- /*
- * If evicted is zero, arc_evict_hdr() must have
- * decided to skip this header, don't increment
- * evict_count in this case.
- */
- if (evicted != 0)
- evict_count++;
-
- /*
- * If arc_size isn't overflowing, signal any
- * threads that might happen to be waiting.
- *
- * For each header evicted, we wake up a single
- * thread. If we used cv_broadcast, we could
- * wake up "too many" threads causing arc_size
- * to significantly overflow arc_c; since
- * arc_get_data_impl() doesn't check for overflow
- * when it's woken up (it doesn't because it's
- * possible for the ARC to be overflowing while
- * full of un-evictable buffers, and the
- * function should proceed in this case).
- *
- * If threads are left sleeping, due to not
- * using cv_broadcast here, they will be woken
- * up via cv_broadcast in arc_adjust_cb() just
- * before arc_adjust_zthr sleeps.
- */
- mutex_enter(&arc_adjust_lock);
- if (!arc_is_overflowing())
- cv_signal(&arc_adjust_waiters_cv);
- mutex_exit(&arc_adjust_lock);
- } else {
- ARCSTAT_BUMP(arcstat_mutex_miss);
- }
- }
-
- multilist_sublist_unlock(mls);
-
- return (bytes_evicted);
-}
-
-/*
- * Evict buffers from the given arc state, until we've removed the
- * specified number of bytes. Move the removed buffers to the
- * appropriate evict state.
- *
- * This function makes a "best effort". It skips over any buffers
- * it can't get a hash_lock on, and so, may not catch all candidates.
- * It may also return without evicting as much space as requested.
- *
- * If bytes is specified using the special value ARC_EVICT_ALL, this
- * will evict all available (i.e. unlocked and evictable) buffers from
- * the given arc state; which is used by arc_flush().
- */
-static uint64_t
-arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes,
- arc_buf_contents_t type)
-{
- uint64_t total_evicted = 0;
- multilist_t *ml = state->arcs_list[type];
- int num_sublists;
- arc_buf_hdr_t **markers;
-
- IMPLY(bytes < 0, bytes == ARC_EVICT_ALL);
-
- num_sublists = multilist_get_num_sublists(ml);
-
- /*
- * If we've tried to evict from each sublist, made some
- * progress, but still have not hit the target number of bytes
- * to evict, we want to keep trying. The markers allow us to
- * pick up where we left off for each individual sublist, rather
- * than starting from the tail each time.
- */
- markers = kmem_zalloc(sizeof (*markers) * num_sublists, KM_SLEEP);
- for (int i = 0; i < num_sublists; i++) {
- markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP);
-
- /*
- * A b_spa of 0 is used to indicate that this header is
- * a marker. This fact is used in arc_adjust_type() and
- * arc_evict_state_impl().
- */
- markers[i]->b_spa = 0;
-
- multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
- multilist_sublist_insert_tail(mls, markers[i]);
- multilist_sublist_unlock(mls);
- }
-
- /*
	 * Keep evicting while we haven't hit our target number of bytes,
	 * or while we're evicting all available buffers (ARC_EVICT_ALL).
- */
- while (total_evicted < bytes || bytes == ARC_EVICT_ALL) {
- int sublist_idx = multilist_get_random_index(ml);
- uint64_t scan_evicted = 0;
-
- /*
- * Try to reduce pinned dnodes with a floor of arc_dnode_limit.
- * Request that 10% of the LRUs be scanned by the superblock
- * shrinker.
- */
- if (type == ARC_BUFC_DATA && aggsum_compare(&astat_dnode_size,
- arc_dnode_limit) > 0) {
- arc_prune_async((aggsum_upper_bound(&astat_dnode_size) -
- arc_dnode_limit) / sizeof (dnode_t) /
- zfs_arc_dnode_reduce_percent);
- }
-
- /*
- * Start eviction using a randomly selected sublist,
- * this is to try and evenly balance eviction across all
- * sublists. Always starting at the same sublist
- * (e.g. index 0) would cause evictions to favor certain
- * sublists over others.
- */
- for (int i = 0; i < num_sublists; i++) {
- uint64_t bytes_remaining;
- uint64_t bytes_evicted;
-
- if (bytes == ARC_EVICT_ALL)
- bytes_remaining = ARC_EVICT_ALL;
- else if (total_evicted < bytes)
- bytes_remaining = bytes - total_evicted;
- else
- break;
-
- bytes_evicted = arc_evict_state_impl(ml, sublist_idx,
- markers[sublist_idx], spa, bytes_remaining);
-
- scan_evicted += bytes_evicted;
- total_evicted += bytes_evicted;
-
- /* we've reached the end, wrap to the beginning */
- if (++sublist_idx >= num_sublists)
- sublist_idx = 0;
- }
-
- /*
- * If we didn't evict anything during this scan, we have
- * no reason to believe we'll evict more during another
- * scan, so break the loop.
- */
- if (scan_evicted == 0) {
- /* This isn't possible, let's make that obvious */
- ASSERT3S(bytes, !=, 0);
-
- /*
- * When bytes is ARC_EVICT_ALL, the only way to
- * break the loop is when scan_evicted is zero.
- * In that case, we actually have evicted enough,
- * so we don't want to increment the kstat.
- */
- if (bytes != ARC_EVICT_ALL) {
- ASSERT3S(total_evicted, <, bytes);
- ARCSTAT_BUMP(arcstat_evict_not_enough);
- }
-
- break;
- }
- }
-
- for (int i = 0; i < num_sublists; i++) {
- multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
- multilist_sublist_remove(mls, markers[i]);
- multilist_sublist_unlock(mls);
-
- kmem_cache_free(hdr_full_cache, markers[i]);
- }
- kmem_free(markers, sizeof (*markers) * num_sublists);
-
- return (total_evicted);
-}
-
-/*
- * Flush all "evictable" data of the given type from the arc state
- * specified. This will not evict any "active" buffers (i.e. referenced).
- *
- * When 'retry' is set to B_FALSE, the function will make a single pass
- * over the state and evict any buffers that it can. Since it doesn't
- * continually retry the eviction, it might end up leaving some buffers
- * in the ARC due to lock misses.
- *
- * When 'retry' is set to B_TRUE, the function will continually retry the
- * eviction until *all* evictable buffers have been removed from the
- * state. As a result, if concurrent insertions into the state are
- * allowed (e.g. if the ARC isn't shutting down), this function might
- * wind up in an infinite loop, continually trying to evict buffers.
- */
-static uint64_t
-arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type,
- boolean_t retry)
-{
- uint64_t evicted = 0;
-
- while (zfs_refcount_count(&state->arcs_esize[type]) != 0) {
- evicted += arc_evict_state(state, spa, ARC_EVICT_ALL, type);
-
- if (!retry)
- break;
- }
-
- return (evicted);
-}
-
-/*
- * Helper function for arc_prune_async(); it is responsible for safely
- * handling the execution of a registered arc_prune_func_t.
- */
-static void
-arc_prune_task(void *ptr)
-{
- arc_prune_t *ap = (arc_prune_t *)ptr;
- arc_prune_func_t *func = ap->p_pfunc;
-
- if (func != NULL)
- func(ap->p_adjust, ap->p_private);
-
- zfs_refcount_remove(&ap->p_refcnt, func);
-}
-
-/*
- * Notify registered consumers they must drop holds on a portion of the ARC
- * buffers they reference. This provides a mechanism to ensure the ARC can
- * honor the arc_meta_limit and reclaim otherwise pinned ARC buffers. This
- * is analogous to dnlc_reduce_cache() but more generic.
- *
- * This operation is performed asynchronously so it may be safely called
- * in the context of the arc_reclaim_thread(). A reference is taken here
- * for each registered arc_prune_t and the arc_prune_task() is responsible
- * for releasing it once the registered arc_prune_func_t has completed.
- */
-static void
-arc_prune_async(int64_t adjust)
-{
- arc_prune_t *ap;
-
- mutex_enter(&arc_prune_mtx);
- for (ap = list_head(&arc_prune_list); ap != NULL;
- ap = list_next(&arc_prune_list, ap)) {
-
- if (zfs_refcount_count(&ap->p_refcnt) >= 2)
- continue;
-
- zfs_refcount_add(&ap->p_refcnt, ap->p_pfunc);
- ap->p_adjust = adjust;
- if (taskq_dispatch(arc_prune_taskq, arc_prune_task,
- ap, TQ_SLEEP) == TASKQID_INVALID) {
- zfs_refcount_remove(&ap->p_refcnt, ap->p_pfunc);
- continue;
- }
- ARCSTAT_BUMP(arcstat_prune);
- }
- mutex_exit(&arc_prune_mtx);
-}
-
-/*
- * Evict the specified number of bytes from the state specified,
- * restricting eviction to the spa and type given. This function
- * prevents us from trying to evict more from a state's list than
- * is "evictable", and skips evicting altogether when passed a
- * negative value for "bytes". In contrast, arc_evict_state() will
- * evict everything it can, when passed a negative value for "bytes".
- */
-static uint64_t
-arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes,
- arc_buf_contents_t type)
-{
- int64_t delta;
-
- if (bytes > 0 && zfs_refcount_count(&state->arcs_esize[type]) > 0) {
- delta = MIN(zfs_refcount_count(&state->arcs_esize[type]),
- bytes);
- return (arc_evict_state(state, spa, delta, type));
- }
-
- return (0);
-}
-
-/*
- * The goal of this function is to evict enough meta data buffers from the
- * ARC in order to enforce the arc_meta_limit. Achieving this is slightly
- * more complicated than it appears because it is common for data buffers
- * to have holds on meta data buffers. In addition, dnode meta data buffers
- * will be held by the dnodes in the block preventing them from being freed.
- * This means we can't simply traverse the ARC and expect to always find
- * enough unheld meta data buffer to release.
- *
- * Therefore, this function has been updated to make alternating passes
- * over the ARC releasing data buffers and then newly unheld meta data
- * buffers. This ensures forward progress is maintained and meta_used
- * will decrease. Normally this is sufficient, but if required the ARC
- * will call the registered prune callbacks causing dentry and inodes to
- * be dropped from the VFS cache. This will make dnode meta data buffers
- * available for reclaim.
- */
-static uint64_t
-arc_adjust_meta_balanced(uint64_t meta_used)
-{
- int64_t delta, prune = 0, adjustmnt;
- uint64_t total_evicted = 0;
- arc_buf_contents_t type = ARC_BUFC_DATA;
- int restarts = MAX(zfs_arc_meta_adjust_restarts, 0);
-
-restart:
- /*
	 * This slightly differs from the way we evict from the mru in
- * arc_adjust because we don't have a "target" value (i.e. no
- * "meta" arc_p). As a result, I think we can completely
- * cannibalize the metadata in the MRU before we evict the
- * metadata from the MFU. I think we probably need to implement a
- * "metadata arc_p" value to do this properly.
- */
- adjustmnt = meta_used - arc_meta_limit;
-
- if (adjustmnt > 0 &&
- zfs_refcount_count(&arc_mru->arcs_esize[type]) > 0) {
- delta = MIN(zfs_refcount_count(&arc_mru->arcs_esize[type]),
- adjustmnt);
- total_evicted += arc_adjust_impl(arc_mru, 0, delta, type);
- adjustmnt -= delta;
- }
-
- /*
- * We can't afford to recalculate adjustmnt here. If we do,
- * new metadata buffers can sneak into the MRU or ANON lists,
	 * thus penalizing the MFU metadata. Although the fudge factor is
- * small, it has been empirically shown to be significant for
- * certain workloads (e.g. creating many empty directories). As
- * such, we use the original calculation for adjustmnt, and
- * simply decrement the amount of data evicted from the MRU.
- */
-
- if (adjustmnt > 0 &&
- zfs_refcount_count(&arc_mfu->arcs_esize[type]) > 0) {
- delta = MIN(zfs_refcount_count(&arc_mfu->arcs_esize[type]),
- adjustmnt);
- total_evicted += arc_adjust_impl(arc_mfu, 0, delta, type);
- }
-
- adjustmnt = meta_used - arc_meta_limit;
-
- if (adjustmnt > 0 &&
- zfs_refcount_count(&arc_mru_ghost->arcs_esize[type]) > 0) {
- delta = MIN(adjustmnt,
- zfs_refcount_count(&arc_mru_ghost->arcs_esize[type]));
- total_evicted += arc_adjust_impl(arc_mru_ghost, 0, delta, type);
- adjustmnt -= delta;
- }
-
- if (adjustmnt > 0 &&
- zfs_refcount_count(&arc_mfu_ghost->arcs_esize[type]) > 0) {
- delta = MIN(adjustmnt,
- zfs_refcount_count(&arc_mfu_ghost->arcs_esize[type]));
- total_evicted += arc_adjust_impl(arc_mfu_ghost, 0, delta, type);
- }
-
- /*
- * If after attempting to make the requested adjustment to the ARC
- * the meta limit is still being exceeded then request that the
- * higher layers drop some cached objects which have holds on ARC
- * meta buffers. Requests to the upper layers will be made with
- * increasingly large scan sizes until the ARC is below the limit.
- */
- if (meta_used > arc_meta_limit) {
- if (type == ARC_BUFC_DATA) {
- type = ARC_BUFC_METADATA;
- } else {
- type = ARC_BUFC_DATA;
-
- if (zfs_arc_meta_prune) {
- prune += zfs_arc_meta_prune;
- arc_prune_async(prune);
- }
- }
-
- if (restarts > 0) {
- restarts--;
- goto restart;
- }
- }
- return (total_evicted);
-}
-
-/*
- * Evict metadata buffers from the cache, such that arc_meta_used is
- * capped by the arc_meta_limit tunable.
- */
-static uint64_t
-arc_adjust_meta_only(uint64_t meta_used)
-{
- uint64_t total_evicted = 0;
- int64_t target;
-
- /*
- * If we're over the meta limit, we want to evict enough
- * metadata to get back under the meta limit. We don't want to
- * evict so much that we drop the MRU below arc_p, though. If
- * we're over the meta limit more than we're over arc_p, we
- * evict some from the MRU here, and some from the MFU below.
- */
- target = MIN((int64_t)(meta_used - arc_meta_limit),
- (int64_t)(zfs_refcount_count(&arc_anon->arcs_size) +
- zfs_refcount_count(&arc_mru->arcs_size) - arc_p));
-
- total_evicted += arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
-
- /*
- * Similar to the above, we want to evict enough bytes to get us
- * below the meta limit, but not so much as to drop us below the
- * space allotted to the MFU (which is defined as arc_c - arc_p).
- */
- target = MIN((int64_t)(meta_used - arc_meta_limit),
- (int64_t)(zfs_refcount_count(&arc_mfu->arcs_size) -
- (arc_c - arc_p)));
-
- total_evicted += arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
-
- return (total_evicted);
-}
-
-static uint64_t
-arc_adjust_meta(uint64_t meta_used)
-{
- if (zfs_arc_meta_strategy == ARC_STRATEGY_META_ONLY)
- return (arc_adjust_meta_only(meta_used));
- else
- return (arc_adjust_meta_balanced(meta_used));
-}
-
-/*
- * Return the type of the oldest buffer in the given arc state
- *
- * This function will select a random sublist of type ARC_BUFC_DATA and
- * a random sublist of type ARC_BUFC_METADATA. The tail of each sublist
- * is compared, and the type which contains the "older" buffer will be
- * returned.
- */
-static arc_buf_contents_t
-arc_adjust_type(arc_state_t *state)
-{
- multilist_t *data_ml = state->arcs_list[ARC_BUFC_DATA];
- multilist_t *meta_ml = state->arcs_list[ARC_BUFC_METADATA];
- int data_idx = multilist_get_random_index(data_ml);
- int meta_idx = multilist_get_random_index(meta_ml);
- multilist_sublist_t *data_mls;
- multilist_sublist_t *meta_mls;
- arc_buf_contents_t type;
- arc_buf_hdr_t *data_hdr;
- arc_buf_hdr_t *meta_hdr;
-
- /*
- * We keep the sublist lock until we're finished, to prevent
- * the headers from being destroyed via arc_evict_state().
- */
- data_mls = multilist_sublist_lock(data_ml, data_idx);
- meta_mls = multilist_sublist_lock(meta_ml, meta_idx);
-
- /*
- * These two loops are to ensure we skip any markers that
- * might be at the tail of the lists due to arc_evict_state().
- */
-
- for (data_hdr = multilist_sublist_tail(data_mls); data_hdr != NULL;
- data_hdr = multilist_sublist_prev(data_mls, data_hdr)) {
- if (data_hdr->b_spa != 0)
- break;
- }
-
- for (meta_hdr = multilist_sublist_tail(meta_mls); meta_hdr != NULL;
- meta_hdr = multilist_sublist_prev(meta_mls, meta_hdr)) {
- if (meta_hdr->b_spa != 0)
- break;
- }
-
- if (data_hdr == NULL && meta_hdr == NULL) {
- type = ARC_BUFC_DATA;
- } else if (data_hdr == NULL) {
- ASSERT3P(meta_hdr, !=, NULL);
- type = ARC_BUFC_METADATA;
- } else if (meta_hdr == NULL) {
- ASSERT3P(data_hdr, !=, NULL);
- type = ARC_BUFC_DATA;
- } else {
- ASSERT3P(data_hdr, !=, NULL);
- ASSERT3P(meta_hdr, !=, NULL);
-
- /* The headers can't be on the sublist without an L1 header */
- ASSERT(HDR_HAS_L1HDR(data_hdr));
- ASSERT(HDR_HAS_L1HDR(meta_hdr));
-
- if (data_hdr->b_l1hdr.b_arc_access <
- meta_hdr->b_l1hdr.b_arc_access) {
- type = ARC_BUFC_DATA;
- } else {
- type = ARC_BUFC_METADATA;
- }
- }
-
- multilist_sublist_unlock(meta_mls);
- multilist_sublist_unlock(data_mls);
-
- return (type);
-}
-
-/*
- * Evict buffers from the cache, such that arc_size is capped by arc_c.
- */
-static uint64_t
-arc_adjust(void)
-{
- uint64_t total_evicted = 0;
- uint64_t bytes;
- int64_t target;
- uint64_t asize = aggsum_value(&arc_size);
- uint64_t ameta = aggsum_value(&arc_meta_used);
-
- /*
- * If we're over arc_meta_limit, we want to correct that before
- * potentially evicting data buffers below.
- */
- total_evicted += arc_adjust_meta(ameta);
-
- /*
- * Adjust MRU size
- *
- * If we're over the target cache size, we want to evict enough
- * from the list to get back to our target size. We don't want
- * to evict too much from the MRU, such that it drops below
- * arc_p. So, if we're over our target cache size more than
- * the MRU is over arc_p, we'll evict enough to get back to
- * arc_p here, and then evict more from the MFU below.
- */
- target = MIN((int64_t)(asize - arc_c),
- (int64_t)(zfs_refcount_count(&arc_anon->arcs_size) +
- zfs_refcount_count(&arc_mru->arcs_size) + ameta - arc_p));
-
- /*
- * If we're below arc_meta_min, always prefer to evict data.
- * Otherwise, try to satisfy the requested number of bytes to
- * evict from the type which contains older buffers; in an
- * effort to keep newer buffers in the cache regardless of their
- * type. If we cannot satisfy the number of bytes from this
- * type, spill over into the next type.
- */
- if (arc_adjust_type(arc_mru) == ARC_BUFC_METADATA &&
- ameta > arc_meta_min) {
- bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
- total_evicted += bytes;
-
- /*
- * If we couldn't evict our target number of bytes from
- * metadata, we try to get the rest from data.
- */
- target -= bytes;
-
- total_evicted +=
- arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA);
- } else {
- bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA);
- total_evicted += bytes;
-
- /*
- * If we couldn't evict our target number of bytes from
- * data, we try to get the rest from metadata.
- */
- target -= bytes;
-
- total_evicted +=
- arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
- }
-
- /*
- * Re-sum ARC stats after the first round of evictions.
- */
- asize = aggsum_value(&arc_size);
- ameta = aggsum_value(&arc_meta_used);
-
- /*
- * Adjust MFU size
- *
- * Now that we've tried to evict enough from the MRU to get its
- * size back to arc_p, if we're still above the target cache
- * size, we evict the rest from the MFU.
- */
- target = asize - arc_c;
-
- if (arc_adjust_type(arc_mfu) == ARC_BUFC_METADATA &&
- ameta > arc_meta_min) {
- bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
- total_evicted += bytes;
-
- /*
- * If we couldn't evict our target number of bytes from
- * metadata, we try to get the rest from data.
- */
- target -= bytes;
-
- total_evicted +=
- arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
- } else {
- bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
- total_evicted += bytes;
-
- /*
- * If we couldn't evict our target number of bytes from
		 * data, we try to get the rest from metadata.
- */
- target -= bytes;
-
- total_evicted +=
- arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
- }
-
- /*
- * Adjust ghost lists
- *
- * In addition to the above, the ARC also defines target values
- * for the ghost lists. The sum of the mru list and mru ghost
- * list should never exceed the target size of the cache, and
- * the sum of the mru list, mfu list, mru ghost list, and mfu
- * ghost list should never exceed twice the target size of the
- * cache. The following logic enforces these limits on the ghost
- * caches, and evicts from them as needed.
- */
- target = zfs_refcount_count(&arc_mru->arcs_size) +
- zfs_refcount_count(&arc_mru_ghost->arcs_size) - arc_c;
-
- bytes = arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA);
- total_evicted += bytes;
-
- target -= bytes;
-
- total_evicted +=
- arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_METADATA);
-
- /*
- * We assume the sum of the mru list and mfu list is less than
- * or equal to arc_c (we enforced this above), which means we
- * can use the simpler of the two equations below:
- *
- * mru + mfu + mru ghost + mfu ghost <= 2 * arc_c
- * mru ghost + mfu ghost <= arc_c
- */
- target = zfs_refcount_count(&arc_mru_ghost->arcs_size) +
- zfs_refcount_count(&arc_mfu_ghost->arcs_size) - arc_c;
-
- bytes = arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA);
- total_evicted += bytes;
-
- target -= bytes;
-
- total_evicted +=
- arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_METADATA);
-
- return (total_evicted);
-}
-
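-/*
- * Evict all evictable buffers belonging to the given spa (or to all
- * spas when 'spa' is NULL) from every ARC state. With 'retry' set,
- * keep retrying until each state's evictable size has reached zero.
- */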
-void
-arc_flush(spa_t *spa, boolean_t retry)
-{
- uint64_t guid = 0;
-
- /*
- * If retry is B_TRUE, a spa must not be specified since we have
- * no good way to determine if all of a spa's buffers have been
- * evicted from an arc state.
- */
- ASSERT(!retry || spa == 0);
-
- if (spa != NULL)
- guid = spa_load_guid(spa);
-
- (void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry);
- (void) arc_flush_state(arc_mru, guid, ARC_BUFC_METADATA, retry);
-
- (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_DATA, retry);
- (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_METADATA, retry);
-
- (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_DATA, retry);
- (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_METADATA, retry);
-
- (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry);
- (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry);
-}
-
-static void
-arc_reduce_target_size(int64_t to_free)
-{
- uint64_t asize = aggsum_value(&arc_size);
- if (arc_c > arc_c_min) {
- DTRACE_PROBE4(arc__shrink, uint64_t, arc_c, uint64_t,
- arc_c_min, uint64_t, arc_p, uint64_t, to_free);
- if (arc_c > arc_c_min + to_free)
- atomic_add_64(&arc_c, -to_free);
- else
- arc_c = arc_c_min;
-
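-		/*
-		 * Also shrink the MRU target (arc_p) by
-		 * 1/2^arc_shrink_shift, i.e. by 1/128 assuming the
-		 * default arc_shrink_shift of 7.
-		 */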
- atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
- if (asize < arc_c)
- arc_c = MAX(asize, arc_c_min);
- if (arc_p > arc_c)
- arc_p = (arc_c >> 1);
-
- DTRACE_PROBE2(arc__shrunk, uint64_t, arc_c, uint64_t,
- arc_p);
-
- ASSERT(arc_c >= arc_c_min);
- ASSERT((int64_t)arc_p >= 0);
- }
-
- if (asize > arc_c) {
- DTRACE_PROBE2(arc__shrink_adjust, uint64_t, asize,
- uint64_t, arc_c);
- /* See comment in arc_adjust_cb_check() on why lock+flag */
- mutex_enter(&arc_adjust_lock);
- arc_adjust_needed = B_TRUE;
- mutex_exit(&arc_adjust_lock);
- zthr_wakeup(arc_adjust_zthr);
- }
-}
-
-typedef enum free_memory_reason_t {
- FMR_UNKNOWN,
- FMR_NEEDFREE,
- FMR_LOTSFREE,
- FMR_SWAPFS_MINFREE,
- FMR_PAGES_PP_MAXIMUM,
- FMR_HEAP_ARENA,
- FMR_ZIO_ARENA,
-} free_memory_reason_t;
-
-int64_t last_free_memory;
-free_memory_reason_t last_free_reason;
-
-/*
- * Additional reserve of pages for pp_reserve.
- */
-int64_t arc_pages_pp_reserve = 64;
-
-/*
- * Additional reserve of pages for swapfs.
- */
-int64_t arc_swapfs_reserve = 64;
-
-/*
- * Return the amount of memory that can be consumed before reclaim will be
- * needed. Positive if there is sufficient free memory, negative indicates
- * the amount of memory that needs to be freed up.
- */
-static int64_t
-arc_available_memory(void)
-{
- int64_t lowest = INT64_MAX;
- int64_t n;
- free_memory_reason_t r = FMR_UNKNOWN;
-
-#ifdef _KERNEL
-#ifdef __FreeBSD__
- /*
- * Cooperate with pagedaemon when it's time for it to scan
- * and reclaim some pages.
- */
- n = PAGESIZE * ((int64_t)freemem - zfs_arc_free_target);
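-	/*
-	 * For example (hypothetical numbers): with 4 KiB pages, freemem at
-	 * 10000 pages and zfs_arc_free_target at 12000 pages, n comes to
-	 * about -8 MB, i.e. roughly 8 MB should be freed to get back to
-	 * the target.
-	 */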
- if (n < lowest) {
- lowest = n;
- r = FMR_LOTSFREE;
- }
-
-#else
- if (needfree > 0) {
- n = PAGESIZE * (-needfree);
- if (n < lowest) {
- lowest = n;
- r = FMR_NEEDFREE;
- }
- }
-
- /*
- * check that we're out of range of the pageout scanner. It starts to
- * schedule paging if freemem is less than lotsfree and needfree.
- * lotsfree is the high-water mark for pageout, and needfree is the
- * number of needed free pages. We add extra pages here to make sure
- * the scanner doesn't start up while we're freeing memory.
- */
- n = PAGESIZE * (freemem - lotsfree - needfree - desfree);
- if (n < lowest) {
- lowest = n;
- r = FMR_LOTSFREE;
- }
-
- /*
- * check to make sure that swapfs has enough space so that anon
- * reservations can still succeed. anon_resvmem() checks that the
- * availrmem is greater than swapfs_minfree, and the number of reserved
- * swap pages. We also add a bit of extra here just to prevent
- * circumstances from getting really dire.
- */
- n = PAGESIZE * (availrmem - swapfs_minfree - swapfs_reserve -
- desfree - arc_swapfs_reserve);
- if (n < lowest) {
- lowest = n;
- r = FMR_SWAPFS_MINFREE;
- }
-
- /*
- * Check that we have enough availrmem that memory locking (e.g., via
- * mlock(3C) or memcntl(2)) can still succeed. (pages_pp_maximum
- * stores the number of pages that cannot be locked; when availrmem
- * drops below pages_pp_maximum, page locking mechanisms such as
- * page_pp_lock() will fail.)
- */
- n = PAGESIZE * (availrmem - pages_pp_maximum -
- arc_pages_pp_reserve);
- if (n < lowest) {
- lowest = n;
- r = FMR_PAGES_PP_MAXIMUM;
- }
-
-#endif /* __FreeBSD__ */
-#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC)
- /*
- * If we're on an i386 platform, it's possible that we'll exhaust the
- * kernel heap space before we ever run out of available physical
- * memory. Most checks of the size of the heap_area compare against
- * tune.t_minarmem, which is the minimum available real memory that we
- * can have in the system. However, this is generally fixed at 25 pages
- * which is so low that it's useless. In this comparison, we seek to
- * calculate the total heap-size, and reclaim if more than 3/4ths of the
- * heap is allocated. (Or, in the calculation, if less than 1/4th is
- * free)
- */
- n = uma_avail() - (long)(uma_limit() / 4);
- if (n < lowest) {
- lowest = n;
- r = FMR_HEAP_ARENA;
- }
-#endif
-
- /*
- * If zio data pages are being allocated out of a separate heap segment,
- * then enforce that the size of available vmem for this arena remains
- * above about 1/4th (1/(2^arc_zio_arena_free_shift)) free.
- *
- * Note that reducing the arc_zio_arena_free_shift keeps more virtual
- * memory (in the zio_arena) free, which can avoid memory
- * fragmentation issues.
- */
- if (zio_arena != NULL) {
- n = (int64_t)vmem_size(zio_arena, VMEM_FREE) -
- (vmem_size(zio_arena, VMEM_ALLOC) >>
- arc_zio_arena_free_shift);
- if (n < lowest) {
- lowest = n;
- r = FMR_ZIO_ARENA;
- }
- }
-
-#else /* _KERNEL */
- /* Every 100 calls, free a small amount */
- if (spa_get_random(100) == 0)
- lowest = -1024;
-#endif /* _KERNEL */
-
- last_free_memory = lowest;
- last_free_reason = r;
- DTRACE_PROBE2(arc__available_memory, int64_t, lowest, int, r);
- return (lowest);
-}
-
-/*
- * Determine if the system is under memory pressure and is asking
- * to reclaim memory. A return value of B_TRUE indicates that the system
- * is under memory pressure and that the arc should adjust accordingly.
- */
-static boolean_t
-arc_reclaim_needed(void)
-{
- return (arc_available_memory() < 0);
-}
-
-extern kmem_cache_t *zio_buf_cache[];
-extern kmem_cache_t *zio_data_buf_cache[];
-extern kmem_cache_t *range_seg_cache;
-extern kmem_cache_t *abd_chunk_cache;
-
-static __noinline void
-arc_kmem_reap_soon(void)
-{
- size_t i;
- kmem_cache_t *prev_cache = NULL;
- kmem_cache_t *prev_data_cache = NULL;
-
- DTRACE_PROBE(arc__kmem_reap_start);
-#ifdef _KERNEL
- if (aggsum_compare(&arc_meta_used, arc_meta_limit) >= 0) {
- /*
- * We are exceeding our meta-data cache limit.
- * Purge some DNLC entries to release holds on meta-data.
- */
- dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
- }
-#if defined(__i386)
- /*
- * Reclaim unused memory from all kmem caches.
- */
- kmem_reap();
-#endif
-#endif
-
- for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
- if (zio_buf_cache[i] != prev_cache) {
- prev_cache = zio_buf_cache[i];
- kmem_cache_reap_soon(zio_buf_cache[i]);
- }
- if (zio_data_buf_cache[i] != prev_data_cache) {
- prev_data_cache = zio_data_buf_cache[i];
- kmem_cache_reap_soon(zio_data_buf_cache[i]);
- }
- }
- kmem_cache_reap_soon(abd_chunk_cache);
- kmem_cache_reap_soon(buf_cache);
- kmem_cache_reap_soon(hdr_full_cache);
- kmem_cache_reap_soon(hdr_l2only_cache);
- kmem_cache_reap_soon(range_seg_cache);
-
-#ifdef illumos
- if (zio_arena != NULL) {
- /*
- * Ask the vmem arena to reclaim unused memory from its
- * quantum caches.
- */
- vmem_qcache_reap(zio_arena);
- }
-#endif
- DTRACE_PROBE(arc__kmem_reap_end);
-}
-
-/* ARGSUSED */
-static boolean_t
-arc_adjust_cb_check(void *arg, zthr_t *zthr)
-{
- /*
- * This is necessary in order for the mdb ::arc dcmd to
	 * show up-to-date information. Since the ::arc command
	 * does not call the kstat's update function, without
	 * this call, the command may show stale stats for the
	 * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even
	 * with this change, the data might be up to 1 second
	 * out of date (the arc_adjust_zthr has a maximum sleep
- * time of 1 second); but that should suffice. The
- * arc_state_t structures can be queried directly if more
- * accurate information is needed.
- */
- if (arc_ksp != NULL)
- arc_ksp->ks_update(arc_ksp, KSTAT_READ);
-
- /*
- * We have to rely on arc_get_data_impl() to tell us when to adjust,
- * rather than checking if we are overflowing here, so that we are
- * sure to not leave arc_get_data_impl() waiting on
- * arc_adjust_waiters_cv. If we have become "not overflowing" since
- * arc_get_data_impl() checked, we need to wake it up. We could
- * broadcast the CV here, but arc_get_data_impl() may have not yet
- * gone to sleep. We would need to use a mutex to ensure that this
- * function doesn't broadcast until arc_get_data_impl() has gone to
- * sleep (e.g. the arc_adjust_lock). However, the lock ordering of
- * such a lock would necessarily be incorrect with respect to the
- * zthr_lock, which is held before this function is called, and is
- * held by arc_get_data_impl() when it calls zthr_wakeup().
- */
- return (arc_adjust_needed);
-}
-
-/*
- * Keep arc_size under arc_c by running arc_adjust which evicts data
- * from the ARC.
- */
-/* ARGSUSED */
-static void
-arc_adjust_cb(void *arg, zthr_t *zthr)
-{
- uint64_t evicted = 0;
-
- /* Evict from cache */
- evicted = arc_adjust();
-
- /*
- * If evicted is zero, we couldn't evict anything
- * via arc_adjust(). This could be due to hash lock
- * collisions, but more likely due to the majority of
- * arc buffers being unevictable. Therefore, even if
- * arc_size is above arc_c, another pass is unlikely to
- * be helpful and could potentially cause us to enter an
- * infinite loop. Additionally, zthr_iscancelled() is
- * checked here so that if the arc is shutting down, the
- * broadcast will wake any remaining arc adjust waiters.
- */
- mutex_enter(&arc_adjust_lock);
- arc_adjust_needed = !zthr_iscancelled(arc_adjust_zthr) &&
- evicted > 0 && aggsum_compare(&arc_size, arc_c) > 0;
- if (!arc_adjust_needed) {
- /*
- * We're either no longer overflowing, or we
- * can't evict anything more, so we should wake
- * up any waiters.
- */
- cv_broadcast(&arc_adjust_waiters_cv);
- }
- mutex_exit(&arc_adjust_lock);
-}
-
-/* ARGSUSED */
-static boolean_t
-arc_reap_cb_check(void *arg, zthr_t *zthr)
-{
- int64_t free_memory = arc_available_memory();
-
- /*
- * If a kmem reap is already active, don't schedule more. We must
- * check for this because kmem_cache_reap_soon() won't actually
- * block on the cache being reaped (this is to prevent callers from
- * becoming implicitly blocked by a system-wide kmem reap -- which,
- * on a system with many, many full magazines, can take minutes).
- */
- if (!kmem_cache_reap_active() &&
- free_memory < 0) {
- arc_no_grow = B_TRUE;
- arc_warm = B_TRUE;
- /*
		 * Wait at least arc_grow_retry (default 60) seconds
- * before considering growing.
- */
- arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry);
- return (B_TRUE);
- } else if (free_memory < arc_c >> arc_no_grow_shift) {
- arc_no_grow = B_TRUE;
- } else if (gethrtime() >= arc_growtime) {
- arc_no_grow = B_FALSE;
- }
-
- return (B_FALSE);
-}
-
-/*
- * Keep enough free memory in the system by reaping the ARC's kmem
- * caches. To cause more slabs to be reapable, we may reduce the
- * target size of the cache (arc_c), causing the arc_adjust_cb()
- * to free more buffers.
- */
-/* ARGSUSED */
-static void
-arc_reap_cb(void *arg, zthr_t *zthr)
-{
- int64_t free_memory;
-
- /*
- * Kick off asynchronous kmem_reap()'s of all our caches.
- */
- arc_kmem_reap_soon();
-
- /*
- * Wait at least arc_kmem_cache_reap_retry_ms between
- * arc_kmem_reap_soon() calls. Without this check it is possible to
- * end up in a situation where we spend lots of time reaping
- * caches, while we're near arc_c_min. Waiting here also gives the
- * subsequent free memory check a chance of finding that the
- * asynchronous reap has already freed enough memory, and we don't
- * need to call arc_reduce_target_size().
- */
- delay((hz * arc_kmem_cache_reap_retry_ms + 999) / 1000);
-
- /*
- * Reduce the target size as needed to maintain the amount of free
- * memory in the system at a fraction of the arc_size (1/128th by
- * default). If oversubscribed (free_memory < 0) then reduce the
- * target arc_size by the deficit amount plus the fractional
	 * amount. If free memory is positive but less than the fractional
- * amount, reduce by what is needed to hit the fractional amount.
- */
- free_memory = arc_available_memory();
-
- int64_t to_free =
- (arc_c >> arc_shrink_shift) - free_memory;
- if (to_free > 0) {
-#ifdef _KERNEL
-#ifdef illumos
- to_free = MAX(to_free, ptob(needfree));
-#endif
-#endif
- arc_reduce_target_size(to_free);
- }
-}
-
-static u_int arc_dnlc_evicts_arg;
-extern struct vfsops zfs_vfsops;
-
-static void
-arc_dnlc_evicts_thread(void *dummy __unused)
-{
- callb_cpr_t cpr;
- u_int percent;
-
- CALLB_CPR_INIT(&cpr, &arc_dnlc_evicts_lock, callb_generic_cpr, FTAG);
-
- mutex_enter(&arc_dnlc_evicts_lock);
- while (!arc_dnlc_evicts_thread_exit) {
- CALLB_CPR_SAFE_BEGIN(&cpr);
- (void) cv_wait(&arc_dnlc_evicts_cv, &arc_dnlc_evicts_lock);
- CALLB_CPR_SAFE_END(&cpr, &arc_dnlc_evicts_lock);
- if (arc_dnlc_evicts_arg != 0) {
- percent = arc_dnlc_evicts_arg;
- mutex_exit(&arc_dnlc_evicts_lock);
-#ifdef _KERNEL
- vnlru_free(desiredvnodes * percent / 100, &zfs_vfsops);
-#endif
- mutex_enter(&arc_dnlc_evicts_lock);
- /*
			 * Clear our token only after the vnlru_free()
- * pass is done, to avoid false queueing of
- * the requests.
- */
- arc_dnlc_evicts_arg = 0;
- }
- }
- arc_dnlc_evicts_thread_exit = FALSE;
- cv_broadcast(&arc_dnlc_evicts_cv);
- CALLB_CPR_EXIT(&cpr);
- thread_exit();
-}
-
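-/*
- * FreeBSD stand-in for the illumos dnlc_reduce_cache(): hand the
- * requested percentage to arc_dnlc_evicts_thread(), which trims the
- * vnode cache asynchronously via vnlru_free(). A request arriving while
- * another is still pending is dropped.
- */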
-void
-dnlc_reduce_cache(void *arg)
-{
- u_int percent;
-
- percent = (u_int)(uintptr_t)arg;
- mutex_enter(&arc_dnlc_evicts_lock);
- if (arc_dnlc_evicts_arg == 0) {
- arc_dnlc_evicts_arg = percent;
- cv_broadcast(&arc_dnlc_evicts_cv);
- }
- mutex_exit(&arc_dnlc_evicts_lock);
-}
-
-/*
- * Adapt arc info given the number of bytes we are trying to add and
- * the state that we are coming from. This function is only called
- * when we are adding new content to the cache.
- */
-static void
-arc_adapt(int bytes, arc_state_t *state)
-{
- int mult;
- uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
- int64_t mrug_size = zfs_refcount_count(&arc_mru_ghost->arcs_size);
- int64_t mfug_size = zfs_refcount_count(&arc_mfu_ghost->arcs_size);
-
- if (state == arc_l2c_only)
- return;
-
- ASSERT(bytes > 0);
- /*
- * Adapt the target size of the MRU list:
- * - if we just hit in the MRU ghost list, then increase
- * the target size of the MRU list.
- * - if we just hit in the MFU ghost list, then increase
- * the target size of the MFU list by decreasing the
- * target size of the MRU list.
- */
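-	/*
-	 * For example (hypothetical sizes): on an MRU ghost hit with
-	 * mrug_size = 100 MiB and mfug_size = 300 MiB, mult = 3, so arc_p
-	 * grows by three times the hit size (mult is capped at 10 below,
-	 * and arc_p is clamped to arc_c - arc_p_min).
-	 */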
- if (state == arc_mru_ghost) {
- mult = (mrug_size >= mfug_size) ? 1 : (mfug_size / mrug_size);
- mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
-
- arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
- } else if (state == arc_mfu_ghost) {
- uint64_t delta;
-
- mult = (mfug_size >= mrug_size) ? 1 : (mrug_size / mfug_size);
- mult = MIN(mult, 10);
-
- delta = MIN(bytes * mult, arc_p);
- arc_p = MAX(arc_p_min, arc_p - delta);
- }
- ASSERT((int64_t)arc_p >= 0);
-
- /*
- * Wake reap thread if we do not have any available memory
- */
- if (arc_reclaim_needed()) {
- zthr_wakeup(arc_reap_zthr);
- return;
- }
-
- if (arc_no_grow)
- return;
-
- if (arc_c >= arc_c_max)
- return;
-
- /*
- * If we're within (2 * maxblocksize) bytes of the target
	 * cache size, increment the target cache size.
- */
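-	/*
-	 * With SPA_MAXBLOCKSHIFT of 24 this window is 32 MiB; e.g. with
-	 * arc_c at 1 GiB, we start growing once arc_size exceeds
-	 * 1 GiB - 32 MiB.
-	 */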
- if (aggsum_compare(&arc_size, arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) >
- 0) {
- DTRACE_PROBE1(arc__inc_adapt, int, bytes);
- atomic_add_64(&arc_c, (int64_t)bytes);
- if (arc_c > arc_c_max)
- arc_c = arc_c_max;
- else if (state == arc_anon)
- atomic_add_64(&arc_p, (int64_t)bytes);
- if (arc_p > arc_c)
- arc_p = arc_c;
- }
- ASSERT((int64_t)arc_p >= 0);
-}
-
-/*
- * Check if arc_size has grown past our upper threshold, determined by
- * zfs_arc_overflow_shift.
- */
-static boolean_t
-arc_is_overflowing(void)
-{
- /* Always allow at least one block of overflow */
- int64_t overflow = MAX(SPA_MAXBLOCKSIZE,
- arc_c >> zfs_arc_overflow_shift);
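-	/*
-	 * For example (illustrative, assuming the default
-	 * zfs_arc_overflow_shift of 8): with arc_c = 8 GB the overflow
-	 * allowance is 32 MB, comfortably above the SPA_MAXBLOCKSIZE
-	 * floor.
-	 */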
-
- /*
- * We just compare the lower bound here for performance reasons. Our
- * primary goals are to make sure that the arc never grows without
- * bound, and that it can reach its maximum size. This check
- * accomplishes both goals. The maximum amount we could run over by is
- * 2 * aggsum_borrow_multiplier * NUM_CPUS * the average size of a block
- * in the ARC. In practice, that's in the tens of MB, which is low
- * enough to be safe.
- */
- return (aggsum_lower_bound(&arc_size) >= (int64_t)arc_c + overflow);
-}
-
-static abd_t *
-arc_get_data_abd(arc_buf_hdr_t *hdr, uint64_t size, void *tag, boolean_t do_adapt)
-{
- arc_buf_contents_t type = arc_buf_type(hdr);
-
- arc_get_data_impl(hdr, size, tag, do_adapt);
- if (type == ARC_BUFC_METADATA) {
- return (abd_alloc(size, B_TRUE));
- } else {
- ASSERT(type == ARC_BUFC_DATA);
- return (abd_alloc(size, B_FALSE));
- }
-}
-
-static void *
-arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
-{
- arc_buf_contents_t type = arc_buf_type(hdr);
-
- arc_get_data_impl(hdr, size, tag, B_TRUE);
- if (type == ARC_BUFC_METADATA) {
- return (zio_buf_alloc(size));
- } else {
- ASSERT(type == ARC_BUFC_DATA);
- return (zio_data_buf_alloc(size));
- }
-}
-
-/*
- * Allocate a block and return it to the caller. If we are hitting the
- * hard limit for the cache size, we must sleep, waiting for the eviction
- * thread to catch up. If we're past the target size but below the hard
- * limit, we'll only signal the reclaim thread and continue on.
- */
-static void
-arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag, boolean_t do_adapt)
-{
- arc_state_t *state = hdr->b_l1hdr.b_state;
- arc_buf_contents_t type = arc_buf_type(hdr);
-
- if (do_adapt)
- arc_adapt(size, state);
-
- /*
- * If arc_size is currently overflowing, and has grown past our
- * upper limit, we must be adding data faster than the evict
- * thread can evict. Thus, to ensure we don't compound the
- * problem by adding more data and forcing arc_size to grow even
-	 * further past its target size, we halt and wait for the
- * eviction thread to catch up.
- *
- * It's also possible that the reclaim thread is unable to evict
- * enough buffers to get arc_size below the overflow limit (e.g.
- * due to buffers being un-evictable, or hash lock collisions).
-	 * In this case, we want to proceed regardless of whether
-	 * we're overflowing; thus we don't use a while loop here.
- */
- if (arc_is_overflowing()) {
- mutex_enter(&arc_adjust_lock);
-
- /*
- * Now that we've acquired the lock, we may no longer be
-		 * over the overflow limit, let's check.
- *
- * We're ignoring the case of spurious wake ups. If that
- * were to happen, it'd let this thread consume an ARC
- * buffer before it should have (i.e. before we're under
- * the overflow limit and were signalled by the reclaim
- * thread). As long as that is a rare occurrence, it
- * shouldn't cause any harm.
- */
- if (arc_is_overflowing()) {
- arc_adjust_needed = B_TRUE;
- zthr_wakeup(arc_adjust_zthr);
- (void) cv_wait(&arc_adjust_waiters_cv,
- &arc_adjust_lock);
- }
- mutex_exit(&arc_adjust_lock);
- }
-
- VERIFY3U(hdr->b_type, ==, type);
- if (type == ARC_BUFC_METADATA) {
- arc_space_consume(size, ARC_SPACE_META);
- } else {
- arc_space_consume(size, ARC_SPACE_DATA);
- }
-
- /*
- * Update the state size. Note that ghost states have a
- * "ghost size" and so don't need to be updated.
- */
- if (!GHOST_STATE(state)) {
-
- (void) zfs_refcount_add_many(&state->arcs_size, size, tag);
-
- /*
- * If this is reached via arc_read, the link is
- * protected by the hash lock. If reached via
- * arc_buf_alloc, the header should not be accessed by
- * any other thread. And, if reached via arc_read_done,
- * the hash lock will protect it if it's found in the
- * hash table; otherwise no other thread should be
- * trying to [add|remove]_reference it.
- */
- if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
- ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
- (void) zfs_refcount_add_many(&state->arcs_esize[type],
- size, tag);
- }
-
- /*
- * If we are growing the cache, and we are adding anonymous
- * data, and we have outgrown arc_p, update arc_p
- */
- if (aggsum_upper_bound(&arc_size) < arc_c &&
- hdr->b_l1hdr.b_state == arc_anon &&
- (zfs_refcount_count(&arc_anon->arcs_size) +
- zfs_refcount_count(&arc_mru->arcs_size) > arc_p))
- arc_p = MIN(arc_c, arc_p + size);
- }
- ARCSTAT_BUMP(arcstat_allocated);
-}
-
-static void
-arc_free_data_abd(arc_buf_hdr_t *hdr, abd_t *abd, uint64_t size, void *tag)
-{
- arc_free_data_impl(hdr, size, tag);
- abd_free(abd);
-}
-
-static void
-arc_free_data_buf(arc_buf_hdr_t *hdr, void *buf, uint64_t size, void *tag)
-{
- arc_buf_contents_t type = arc_buf_type(hdr);
-
- arc_free_data_impl(hdr, size, tag);
- if (type == ARC_BUFC_METADATA) {
- zio_buf_free(buf, size);
- } else {
- ASSERT(type == ARC_BUFC_DATA);
- zio_data_buf_free(buf, size);
- }
-}
-
-/*
- * Free the arc data buffer.
- */
-static void
-arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
-{
- arc_state_t *state = hdr->b_l1hdr.b_state;
- arc_buf_contents_t type = arc_buf_type(hdr);
-
- /* protected by hash lock, if in the hash table */
- if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
- ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
- ASSERT(state != arc_anon && state != arc_l2c_only);
-
- (void) zfs_refcount_remove_many(&state->arcs_esize[type],
- size, tag);
- }
- (void) zfs_refcount_remove_many(&state->arcs_size, size, tag);
-
- VERIFY3U(hdr->b_type, ==, type);
- if (type == ARC_BUFC_METADATA) {
- arc_space_return(size, ARC_SPACE_META);
- } else {
- ASSERT(type == ARC_BUFC_DATA);
- arc_space_return(size, ARC_SPACE_DATA);
- }
-}
-
-/*
- * This routine is called whenever a buffer is accessed.
- * NOTE: the hash lock is dropped in this function.
- */
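-/*
- * Summary of the state transitions implemented below:
- *	anon      -> mru	(first access)
- *	mru       -> mfu	(re-accessed after ARC_MINTIME)
- *	mru_ghost -> mfu	(or mru, if the access is a prefetch)
- *	mfu       -> mfu	(stays put)
- *	mfu_ghost -> mfu	(or mru, if the access is a prefetch)
- *	l2c_only  -> mfu
- */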
-static void
-arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
-{
- clock_t now;
-
- ASSERT(MUTEX_HELD(hash_lock));
- ASSERT(HDR_HAS_L1HDR(hdr));
-
- if (hdr->b_l1hdr.b_state == arc_anon) {
- /*
- * This buffer is not in the cache, and does not
- * appear in our "ghost" list. Add the new buffer
- * to the MRU state.
- */
-
- ASSERT0(hdr->b_l1hdr.b_arc_access);
- hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
- DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
- arc_change_state(arc_mru, hdr, hash_lock);
-
- } else if (hdr->b_l1hdr.b_state == arc_mru) {
- now = ddi_get_lbolt();
-
- /*
- * If this buffer is here because of a prefetch, then either:
- * - clear the flag if this is a "referencing" read
- * (any subsequent access will bump this into the MFU state).
- * or
- * - move the buffer to the head of the list if this is
- * another prefetch (to make it less likely to be evicted).
- */
- if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
- if (zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
- /* link protected by hash lock */
- ASSERT(multilist_link_active(
- &hdr->b_l1hdr.b_arc_node));
- } else {
- arc_hdr_clear_flags(hdr,
- ARC_FLAG_PREFETCH |
- ARC_FLAG_PRESCIENT_PREFETCH);
- ARCSTAT_BUMP(arcstat_mru_hits);
- }
- hdr->b_l1hdr.b_arc_access = now;
- return;
- }
-
- /*
- * This buffer has been "accessed" only once so far,
- * but it is still in the cache. Move it to the MFU
- * state.
- */
- if (now > hdr->b_l1hdr.b_arc_access + ARC_MINTIME) {
- /*
-			 * More than ARC_MINTIME has passed since we
- * instantiated this buffer. Move it to the
- * most frequently used state.
- */
- hdr->b_l1hdr.b_arc_access = now;
- DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
- arc_change_state(arc_mfu, hdr, hash_lock);
- }
- atomic_inc_32(&hdr->b_l1hdr.b_mru_hits);
- ARCSTAT_BUMP(arcstat_mru_hits);
- } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) {
- arc_state_t *new_state;
- /*
- * This buffer has been "accessed" recently, but
- * was evicted from the cache. Move it to the
- * MFU state.
- */
-
- if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
- new_state = arc_mru;
- if (zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) {
- arc_hdr_clear_flags(hdr,
- ARC_FLAG_PREFETCH |
- ARC_FLAG_PRESCIENT_PREFETCH);
- }
- DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
- } else {
- new_state = arc_mfu;
- DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
- }
-
- hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
- arc_change_state(new_state, hdr, hash_lock);
-
- atomic_inc_32(&hdr->b_l1hdr.b_mru_ghost_hits);
- ARCSTAT_BUMP(arcstat_mru_ghost_hits);
- } else if (hdr->b_l1hdr.b_state == arc_mfu) {
- /*
- * This buffer has been accessed more than once and is
- * still in the cache. Keep it in the MFU state.
- *
- * NOTE: an add_reference() that occurred when we did
- * the arc_read() will have kicked this off the list.
- * If it was a prefetch, we will explicitly move it to
- * the head of the list now.
- */
-
- atomic_inc_32(&hdr->b_l1hdr.b_mfu_hits);
- ARCSTAT_BUMP(arcstat_mfu_hits);
- hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
- } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) {
- arc_state_t *new_state = arc_mfu;
- /*
- * This buffer has been accessed more than once but has
- * been evicted from the cache. Move it back to the
- * MFU state.
- */
-
- if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
- /*
- * This is a prefetch access...
- * move this block back to the MRU state.
- */
- new_state = arc_mru;
- }
-
- hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
- DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
- arc_change_state(new_state, hdr, hash_lock);
-
- atomic_inc_32(&hdr->b_l1hdr.b_mfu_ghost_hits);
- ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
- } else if (hdr->b_l1hdr.b_state == arc_l2c_only) {
- /*
- * This buffer is on the 2nd Level ARC.
- */
-
- hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
- DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
- arc_change_state(arc_mfu, hdr, hash_lock);
- } else {
- ASSERT(!"invalid arc state");
- }
-}
-
-/*
- * This routine is called by dbuf_hold() to update the arc_access() state
- * which otherwise would be skipped for entries in the dbuf cache.
- */
-void
-arc_buf_access(arc_buf_t *buf)
-{
- mutex_enter(&buf->b_evict_lock);
- arc_buf_hdr_t *hdr = buf->b_hdr;
-
- /*
- * Avoid taking the hash_lock when possible as an optimization.
- * The header must be checked again under the hash_lock in order
- * to handle the case where it is concurrently being released.
- */
- if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) {
- mutex_exit(&buf->b_evict_lock);
- ARCSTAT_BUMP(arcstat_access_skip);
- return;
- }
-
- kmutex_t *hash_lock = HDR_LOCK(hdr);
- mutex_enter(hash_lock);
-
- if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) {
- mutex_exit(hash_lock);
- mutex_exit(&buf->b_evict_lock);
- ARCSTAT_BUMP(arcstat_access_skip);
- return;
- }
-
- mutex_exit(&buf->b_evict_lock);
-
- ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
- hdr->b_l1hdr.b_state == arc_mfu);
-
- DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
- arc_access(hdr, hash_lock);
- mutex_exit(hash_lock);
-
- ARCSTAT_BUMP(arcstat_hits);
- ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
- demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, hits);
-}
-
-/* a generic arc_read_done_func_t which you can use */
-/* ARGSUSED */
-void
-arc_bcopy_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
- arc_buf_t *buf, void *arg)
-{
- if (buf == NULL)
- return;
-
- bcopy(buf->b_data, arg, arc_buf_size(buf));
- arc_buf_destroy(buf, arg);
-}
-
-/* a generic arc_read_done_func_t */
-/* ARGSUSED */
-void
-arc_getbuf_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
- arc_buf_t *buf, void *arg)
-{
- arc_buf_t **bufp = arg;
- if (buf == NULL) {
- ASSERT(zio == NULL || zio->io_error != 0);
- *bufp = NULL;
- } else {
- ASSERT(zio == NULL || zio->io_error == 0);
- *bufp = buf;
- ASSERT(buf->b_data != NULL);
- }
-}
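-/*
- * A hypothetical caller of the helper above might look like this
- * (illustrative sketch, not a call site from this file):
- *
- *	arc_buf_t *buf;
- *	arc_flags_t aflags = ARC_FLAG_WAIT;
- *	int err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf,
- *	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb);
- */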
-
-static void
-arc_hdr_verify(arc_buf_hdr_t *hdr, blkptr_t *bp)
-{
- if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) {
- ASSERT3U(HDR_GET_PSIZE(hdr), ==, 0);
- ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
- } else {
- if (HDR_COMPRESSION_ENABLED(hdr)) {
- ASSERT3U(HDR_GET_COMPRESS(hdr), ==,
- BP_GET_COMPRESS(bp));
- }
- ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp));
- ASSERT3U(HDR_GET_PSIZE(hdr), ==, BP_GET_PSIZE(bp));
- }
-}
-
-static void
-arc_read_done(zio_t *zio)
-{
- arc_buf_hdr_t *hdr = zio->io_private;
- kmutex_t *hash_lock = NULL;
- arc_callback_t *callback_list;
- arc_callback_t *acb;
- boolean_t freeable = B_FALSE;
- boolean_t no_zio_error = (zio->io_error == 0);
-
- /*
- * The hdr was inserted into the hash table and removed from lists
- * prior to starting I/O. We should find this header, since
- * it's in the hash table, and it should be legit since it's
- * not possible to evict it during the I/O. The only possible
- * reason for it not to be found is if we were freed during the
- * read.
- */
- if (HDR_IN_HASH_TABLE(hdr)) {
- ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp));
- ASSERT3U(hdr->b_dva.dva_word[0], ==,
- BP_IDENTITY(zio->io_bp)->dva_word[0]);
- ASSERT3U(hdr->b_dva.dva_word[1], ==,
- BP_IDENTITY(zio->io_bp)->dva_word[1]);
-
- arc_buf_hdr_t *found = buf_hash_find(hdr->b_spa, zio->io_bp,
- &hash_lock);
-
- ASSERT((found == hdr &&
- DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
- (found == hdr && HDR_L2_READING(hdr)));
- ASSERT3P(hash_lock, !=, NULL);
- }
-
- if (no_zio_error) {
- /* byteswap if necessary */
- if (BP_SHOULD_BYTESWAP(zio->io_bp)) {
- if (BP_GET_LEVEL(zio->io_bp) > 0) {
- hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64;
- } else {
- hdr->b_l1hdr.b_byteswap =
- DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
- }
- } else {
- hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
- }
- }
-
- arc_hdr_clear_flags(hdr, ARC_FLAG_L2_EVICTED);
- if (l2arc_noprefetch && HDR_PREFETCH(hdr))
- arc_hdr_clear_flags(hdr, ARC_FLAG_L2CACHE);
-
- callback_list = hdr->b_l1hdr.b_acb;
- ASSERT3P(callback_list, !=, NULL);
-
- if (hash_lock && no_zio_error && hdr->b_l1hdr.b_state == arc_anon) {
- /*
- * Only call arc_access on anonymous buffers. This is because
- * if we've issued an I/O for an evicted buffer, we've already
- * called arc_access (to prevent any simultaneous readers from
- * getting confused).
- */
- arc_access(hdr, hash_lock);
- }
-
- /*
- * If a read request has a callback (i.e. acb_done is not NULL), then we
- * make a buf containing the data according to the parameters which were
- * passed in. The implementation of arc_buf_alloc_impl() ensures that we
- * aren't needlessly decompressing the data multiple times.
- */
- int callback_cnt = 0;
- for (acb = callback_list; acb != NULL; acb = acb->acb_next) {
- if (!acb->acb_done)
- continue;
-
- callback_cnt++;
-
- if (no_zio_error) {
- int error = arc_buf_alloc_impl(hdr, acb->acb_private,
- acb->acb_compressed, zio->io_error == 0,
- &acb->acb_buf);
- if (error != 0) {
- /*
- * Decompression failed. Set io_error
- * so that when we call acb_done (below),
- * we will indicate that the read failed.
- * Note that in the unusual case where one
- * callback is compressed and another
- * uncompressed, we will mark all of them
- * as failed, even though the uncompressed
- * one can't actually fail. In this case,
- * the hdr will not be anonymous, because
- * if there are multiple callbacks, it's
- * because multiple threads found the same
- * arc buf in the hash table.
- */
- zio->io_error = error;
- }
- }
- }
- /*
- * If there are multiple callbacks, we must have the hash lock,
- * because the only way for multiple threads to find this hdr is
- * in the hash table. This ensures that if there are multiple
- * callbacks, the hdr is not anonymous. If it were anonymous,
- * we couldn't use arc_buf_destroy() in the error case below.
- */
- ASSERT(callback_cnt < 2 || hash_lock != NULL);
-
- hdr->b_l1hdr.b_acb = NULL;
- arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
- if (callback_cnt == 0) {
- ASSERT(HDR_PREFETCH(hdr));
- ASSERT0(hdr->b_l1hdr.b_bufcnt);
- ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
- }
-
- ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt) ||
- callback_list != NULL);
-
- if (no_zio_error) {
- arc_hdr_verify(hdr, zio->io_bp);
- } else {
- arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR);
- if (hdr->b_l1hdr.b_state != arc_anon)
- arc_change_state(arc_anon, hdr, hash_lock);
- if (HDR_IN_HASH_TABLE(hdr))
- buf_hash_remove(hdr);
- freeable = zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
- }
-
- /*
- * Broadcast before we drop the hash_lock to avoid the possibility
- * that the hdr (and hence the cv) might be freed before we get to
- * the cv_broadcast().
- */
- cv_broadcast(&hdr->b_l1hdr.b_cv);
-
- if (hash_lock != NULL) {
- mutex_exit(hash_lock);
- } else {
- /*
- * This block was freed while we waited for the read to
- * complete. It has been removed from the hash table and
- * moved to the anonymous state (so that it won't show up
- * in the cache).
- */
- ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
- freeable = zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
- }
-
- /* execute each callback and free its structure */
- while ((acb = callback_list) != NULL) {
- if (acb->acb_done != NULL) {
- if (zio->io_error != 0 && acb->acb_buf != NULL) {
- /*
- * If arc_buf_alloc_impl() fails during
- * decompression, the buf will still be
- * allocated, and needs to be freed here.
- */
- arc_buf_destroy(acb->acb_buf, acb->acb_private);
- acb->acb_buf = NULL;
- }
- acb->acb_done(zio, &zio->io_bookmark, zio->io_bp,
- acb->acb_buf, acb->acb_private);
- }
-
- if (acb->acb_zio_dummy != NULL) {
- acb->acb_zio_dummy->io_error = zio->io_error;
- zio_nowait(acb->acb_zio_dummy);
- }
-
- callback_list = acb->acb_next;
- kmem_free(acb, sizeof (arc_callback_t));
- }
-
- if (freeable)
- arc_hdr_destroy(hdr);
-}
-
-/*
- * "Read" the block at the specified DVA (in bp) via the
- * cache. If the block is found in the cache, invoke the provided
- * callback immediately and return. Note that the `zio' parameter
- * in the callback will be NULL in this case, since no IO was
- * required. If the block is not in the cache, pass the read request
- * on to the spa with a substitute callback function, so that the
- * requested block will be added to the cache.
- *
- * If a read request arrives for a block that has a read in-progress,
- * either wait for the in-progress read to complete (and return the
- * results); or, if this is a read with a "done" func, add a record
- * to the read to invoke the "done" func when the read completes,
- * and return; or just return.
- *
- * arc_read_done() will invoke all the requested "done" functions
- * for readers of this block.
- */
-int
-arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_read_done_func_t *done,
- void *private, zio_priority_t priority, int zio_flags,
- arc_flags_t *arc_flags, const zbookmark_phys_t *zb)
-{
- arc_buf_hdr_t *hdr = NULL;
- kmutex_t *hash_lock = NULL;
- zio_t *rzio;
- uint64_t guid = spa_load_guid(spa);
- boolean_t compressed_read = (zio_flags & ZIO_FLAG_RAW) != 0;
- int rc = 0;
-
- ASSERT(!BP_IS_EMBEDDED(bp) ||
- BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
-
-top:
- if (!BP_IS_EMBEDDED(bp)) {
- /*
-		 * Embedded BPs have no DVA and require no I/O to "read";
-		 * they are backed by an anonymous arc buf allocated in
-		 * the miss path below, so skip the hash lookup for them.
- */
- hdr = buf_hash_find(guid, bp, &hash_lock);
- }
-
- if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_pabd != NULL) {
- arc_buf_t *buf = NULL;
- *arc_flags |= ARC_FLAG_CACHED;
-
- if (HDR_IO_IN_PROGRESS(hdr)) {
- zio_t *head_zio = hdr->b_l1hdr.b_acb->acb_zio_head;
-
- ASSERT3P(head_zio, !=, NULL);
- if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) &&
- priority == ZIO_PRIORITY_SYNC_READ) {
- /*
- * This is a sync read that needs to wait for
- * an in-flight async read. Request that the
- * zio have its priority upgraded.
- */
- zio_change_priority(head_zio, priority);
- DTRACE_PROBE1(arc__async__upgrade__sync,
- arc_buf_hdr_t *, hdr);
- ARCSTAT_BUMP(arcstat_async_upgrade_sync);
- }
- if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
- arc_hdr_clear_flags(hdr,
- ARC_FLAG_PREDICTIVE_PREFETCH);
- }
-
- if (*arc_flags & ARC_FLAG_WAIT) {
- cv_wait(&hdr->b_l1hdr.b_cv, hash_lock);
- mutex_exit(hash_lock);
- goto top;
- }
- ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
-
- if (done) {
- arc_callback_t *acb = NULL;
-
- acb = kmem_zalloc(sizeof (arc_callback_t),
- KM_SLEEP);
- acb->acb_done = done;
- acb->acb_private = private;
- acb->acb_compressed = compressed_read;
- if (pio != NULL)
- acb->acb_zio_dummy = zio_null(pio,
- spa, NULL, NULL, NULL, zio_flags);
-
- ASSERT3P(acb->acb_done, !=, NULL);
- acb->acb_zio_head = head_zio;
- acb->acb_next = hdr->b_l1hdr.b_acb;
- hdr->b_l1hdr.b_acb = acb;
- mutex_exit(hash_lock);
- return (0);
- }
- mutex_exit(hash_lock);
- return (0);
- }
-
- ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
- hdr->b_l1hdr.b_state == arc_mfu);
-
- if (done) {
- if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
- /*
- * This is a demand read which does not have to
- * wait for i/o because we did a predictive
- * prefetch i/o for it, which has completed.
- */
- DTRACE_PROBE1(
- arc__demand__hit__predictive__prefetch,
- arc_buf_hdr_t *, hdr);
- ARCSTAT_BUMP(
- arcstat_demand_hit_predictive_prefetch);
- arc_hdr_clear_flags(hdr,
- ARC_FLAG_PREDICTIVE_PREFETCH);
- }
-
- if (hdr->b_flags & ARC_FLAG_PRESCIENT_PREFETCH) {
- ARCSTAT_BUMP(
- arcstat_demand_hit_prescient_prefetch);
- arc_hdr_clear_flags(hdr,
- ARC_FLAG_PRESCIENT_PREFETCH);
- }
-
- ASSERT(!BP_IS_EMBEDDED(bp) || !BP_IS_HOLE(bp));
- /* Get a buf with the desired data in it. */
- rc = arc_buf_alloc_impl(hdr, private,
- compressed_read, B_TRUE, &buf);
- if (rc != 0) {
- arc_buf_destroy(buf, private);
- buf = NULL;
- }
- ASSERT((zio_flags & ZIO_FLAG_SPECULATIVE) ||
- rc == 0 || rc != ENOENT);
- } else if (*arc_flags & ARC_FLAG_PREFETCH &&
- zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
- arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
- }
- DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
- arc_access(hdr, hash_lock);
- if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH)
- arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH);
- if (*arc_flags & ARC_FLAG_L2CACHE)
- arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
- mutex_exit(hash_lock);
- ARCSTAT_BUMP(arcstat_hits);
- ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
- demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
- data, metadata, hits);
-
- if (done)
- done(NULL, zb, bp, buf, private);
- } else {
- uint64_t lsize = BP_GET_LSIZE(bp);
- uint64_t psize = BP_GET_PSIZE(bp);
- arc_callback_t *acb;
- vdev_t *vd = NULL;
- uint64_t addr = 0;
- boolean_t devw = B_FALSE;
- uint64_t size;
-
- if (hdr == NULL) {
- /* this block is not in the cache */
- arc_buf_hdr_t *exists = NULL;
- arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
- hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize,
- BP_GET_COMPRESS(bp), type);
-
- if (!BP_IS_EMBEDDED(bp)) {
- hdr->b_dva = *BP_IDENTITY(bp);
- hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
- exists = buf_hash_insert(hdr, &hash_lock);
- }
- if (exists != NULL) {
- /* somebody beat us to the hash insert */
- mutex_exit(hash_lock);
- buf_discard_identity(hdr);
- arc_hdr_destroy(hdr);
- goto top; /* restart the IO request */
- }
- } else {
- /*
- * This block is in the ghost cache. If it was L2-only
- * (and thus didn't have an L1 hdr), we realloc the
- * header to add an L1 hdr.
- */
- if (!HDR_HAS_L1HDR(hdr)) {
- hdr = arc_hdr_realloc(hdr, hdr_l2only_cache,
- hdr_full_cache);
- }
- ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
- ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state));
- ASSERT(!HDR_IO_IN_PROGRESS(hdr));
- ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
- ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
- ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
-
- /*
- * This is a delicate dance that we play here.
- * This hdr is in the ghost list so we access it
- * to move it out of the ghost list before we
- * initiate the read. If it's a prefetch then
- * it won't have a callback so we'll remove the
- * reference that arc_buf_alloc_impl() created. We
- * do this after we've called arc_access() to
- * avoid hitting an assert in remove_reference().
- */
- arc_adapt(arc_hdr_size(hdr), hdr->b_l1hdr.b_state);
- arc_access(hdr, hash_lock);
- arc_hdr_alloc_pabd(hdr, B_FALSE);
- }
- ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
- size = arc_hdr_size(hdr);
-
- /*
-		 * If compression is enabled on the hdr, then we will do
- * RAW I/O and will store the compressed data in the hdr's
- * data block. Otherwise, the hdr's data block will contain
- * the uncompressed data.
- */
- if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) {
- zio_flags |= ZIO_FLAG_RAW;
- }
-
- if (*arc_flags & ARC_FLAG_PREFETCH)
- arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
- if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH)
- arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH);
-
- if (*arc_flags & ARC_FLAG_L2CACHE)
- arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
- if (BP_GET_LEVEL(bp) > 0)
- arc_hdr_set_flags(hdr, ARC_FLAG_INDIRECT);
- if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH)
- arc_hdr_set_flags(hdr, ARC_FLAG_PREDICTIVE_PREFETCH);
- ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state));
-
- acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
- acb->acb_done = done;
- acb->acb_private = private;
- acb->acb_compressed = compressed_read;
-
- ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
- hdr->b_l1hdr.b_acb = acb;
- arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
-
- if (HDR_HAS_L2HDR(hdr) &&
- (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) {
- devw = hdr->b_l2hdr.b_dev->l2ad_writing;
- addr = hdr->b_l2hdr.b_daddr;
- /*
- * Lock out L2ARC device removal.
- */
- if (vdev_is_dead(vd) ||
- !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
- vd = NULL;
- }
-
- /*
- * We count both async reads and scrub IOs as asynchronous so
- * that both can be upgraded in the event of a cache hit while
- * the read IO is still in-flight.
- */
- if (priority == ZIO_PRIORITY_ASYNC_READ ||
- priority == ZIO_PRIORITY_SCRUB)
- arc_hdr_set_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ);
- else
- arc_hdr_clear_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ);
-
- /*
- * At this point, we have a level 1 cache miss. Try again in
- * L2ARC if possible.
- */
- ASSERT3U(HDR_GET_LSIZE(hdr), ==, lsize);
-
- DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
- uint64_t, lsize, zbookmark_phys_t *, zb);
- ARCSTAT_BUMP(arcstat_misses);
- ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
- demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
- data, metadata, misses);
-#ifdef _KERNEL
-#ifdef RACCT
- if (racct_enable) {
- PROC_LOCK(curproc);
- racct_add_force(curproc, RACCT_READBPS, size);
- racct_add_force(curproc, RACCT_READIOPS, 1);
- PROC_UNLOCK(curproc);
- }
-#endif /* RACCT */
- curthread->td_ru.ru_inblock++;
-#endif
-
- if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
- /*
- * Read from the L2ARC if the following are true:
- * 1. The L2ARC vdev was previously cached.
- * 2. This buffer still has L2ARC metadata.
- * 3. This buffer isn't currently writing to the L2ARC.
- * 4. The L2ARC entry wasn't evicted, which may
- * also have invalidated the vdev.
-		 * 5. This isn't a prefetch while l2arc_noprefetch is enabled.
- */
- if (HDR_HAS_L2HDR(hdr) &&
- !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
- !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
- l2arc_read_callback_t *cb;
- abd_t *abd;
- uint64_t asize;
-
- DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
- ARCSTAT_BUMP(arcstat_l2_hits);
- atomic_inc_32(&hdr->b_l2hdr.b_hits);
-
- cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
- KM_SLEEP);
- cb->l2rcb_hdr = hdr;
- cb->l2rcb_bp = *bp;
- cb->l2rcb_zb = *zb;
- cb->l2rcb_flags = zio_flags;
-
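-				/*
-				 * Note: if asize differs from size below, the
-				 * read lands in a temporary abd (l2rcb_abd);
-				 * l2arc_read_done() is then responsible for
-				 * copying the data into b_pabd and freeing
-				 * the temporary buffer.
-				 */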
- asize = vdev_psize_to_asize(vd, size);
- if (asize != size) {
- abd = abd_alloc_for_io(asize,
- HDR_ISTYPE_METADATA(hdr));
- cb->l2rcb_abd = abd;
- } else {
- abd = hdr->b_l1hdr.b_pabd;
- }
-
- ASSERT(addr >= VDEV_LABEL_START_SIZE &&
- addr + asize <= vd->vdev_psize -
- VDEV_LABEL_END_SIZE);
-
- /*
- * l2arc read. The SCL_L2ARC lock will be
- * released by l2arc_read_done().
- * Issue a null zio if the underlying buffer
- * was squashed to zero size by compression.
- */
- ASSERT3U(HDR_GET_COMPRESS(hdr), !=,
- ZIO_COMPRESS_EMPTY);
- rzio = zio_read_phys(pio, vd, addr,
- asize, abd,
- ZIO_CHECKSUM_OFF,
- l2arc_read_done, cb, priority,
- zio_flags | ZIO_FLAG_DONT_CACHE |
- ZIO_FLAG_CANFAIL |
- ZIO_FLAG_DONT_PROPAGATE |
- ZIO_FLAG_DONT_RETRY, B_FALSE);
- acb->acb_zio_head = rzio;
-
- if (hash_lock != NULL)
- mutex_exit(hash_lock);
-
- DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
- zio_t *, rzio);
- ARCSTAT_INCR(arcstat_l2_read_bytes, size);
-
- if (*arc_flags & ARC_FLAG_NOWAIT) {
- zio_nowait(rzio);
- return (0);
- }
-
- ASSERT(*arc_flags & ARC_FLAG_WAIT);
- if (zio_wait(rzio) == 0)
- return (0);
-
- /* l2arc read error; goto zio_read() */
- if (hash_lock != NULL)
- mutex_enter(hash_lock);
- } else {
- DTRACE_PROBE1(l2arc__miss,
- arc_buf_hdr_t *, hdr);
- ARCSTAT_BUMP(arcstat_l2_misses);
- if (HDR_L2_WRITING(hdr))
- ARCSTAT_BUMP(arcstat_l2_rw_clash);
- spa_config_exit(spa, SCL_L2ARC, vd);
- }
- } else {
- if (vd != NULL)
- spa_config_exit(spa, SCL_L2ARC, vd);
- if (l2arc_ndev != 0) {
- DTRACE_PROBE1(l2arc__miss,
- arc_buf_hdr_t *, hdr);
- ARCSTAT_BUMP(arcstat_l2_misses);
- }
- }
-
- rzio = zio_read(pio, spa, bp, hdr->b_l1hdr.b_pabd, size,
- arc_read_done, hdr, priority, zio_flags, zb);
- acb->acb_zio_head = rzio;
-
- if (hash_lock != NULL)
- mutex_exit(hash_lock);
-
- if (*arc_flags & ARC_FLAG_WAIT)
- return (zio_wait(rzio));
-
- ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
- zio_nowait(rzio);
- }
- return (0);
-}
-
-arc_prune_t *
-arc_add_prune_callback(arc_prune_func_t *func, void *private)
-{
- arc_prune_t *p;
-
- p = kmem_alloc(sizeof (*p), KM_SLEEP);
- p->p_pfunc = func;
- p->p_private = private;
- list_link_init(&p->p_node);
- zfs_refcount_create(&p->p_refcnt);
-
- mutex_enter(&arc_prune_mtx);
- zfs_refcount_add(&p->p_refcnt, &arc_prune_list);
- list_insert_head(&arc_prune_list, p);
- mutex_exit(&arc_prune_mtx);
-
- return (p);
-}
-
-void
-arc_remove_prune_callback(arc_prune_t *p)
-{
- boolean_t wait = B_FALSE;
- mutex_enter(&arc_prune_mtx);
- list_remove(&arc_prune_list, p);
- if (zfs_refcount_remove(&p->p_refcnt, &arc_prune_list) > 0)
- wait = B_TRUE;
- mutex_exit(&arc_prune_mtx);
-
- /* wait for arc_prune_task to finish */
- if (wait)
- taskq_wait(arc_prune_taskq);
- ASSERT0(zfs_refcount_count(&p->p_refcnt));
- zfs_refcount_destroy(&p->p_refcnt);
- kmem_free(p, sizeof (*p));
-}
-
-/*
- * Notify the arc that a block was freed, and thus will never be used again.
- */
-void
-arc_freed(spa_t *spa, const blkptr_t *bp)
-{
- arc_buf_hdr_t *hdr;
- kmutex_t *hash_lock;
- uint64_t guid = spa_load_guid(spa);
-
- ASSERT(!BP_IS_EMBEDDED(bp));
-
- hdr = buf_hash_find(guid, bp, &hash_lock);
- if (hdr == NULL)
- return;
-
- /*
- * We might be trying to free a block that is still doing I/O
- * (i.e. prefetch) or has a reference (i.e. a dedup-ed,
- * dmu_sync-ed block). If this block is being prefetched, then it
- * would still have the ARC_FLAG_IO_IN_PROGRESS flag set on the hdr
- * until the I/O completes. A block may also have a reference if it is
- * part of a dedup-ed, dmu_sync-ed write. The dmu_sync() function would
- * have written the new block to its final resting place on disk but
- * without the dedup flag set. This would have left the hdr in the MRU
- * state and discoverable. When the txg finally syncs it detects that
- * the block was overridden in open context and issues an override I/O.
- * Since this is a dedup block, the override I/O will determine if the
- * block is already in the DDT. If so, then it will replace the io_bp
- * with the bp from the DDT and allow the I/O to finish. When the I/O
- * reaches the done callback, dbuf_write_override_done, it will
- * check to see if the io_bp and io_bp_override are identical.
- * If they are not, then it indicates that the bp was replaced with
- * the bp in the DDT and the override bp is freed. This allows
- * us to arrive here with a reference on a block that is being
- * freed. So if we have an I/O in progress, or a reference to
- * this hdr, then we don't destroy the hdr.
- */
- if (!HDR_HAS_L1HDR(hdr) || (!HDR_IO_IN_PROGRESS(hdr) &&
- zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt))) {
- arc_change_state(arc_anon, hdr, hash_lock);
- arc_hdr_destroy(hdr);
- mutex_exit(hash_lock);
- } else {
- mutex_exit(hash_lock);
- }
-}
-
-/*
- * Release this buffer from the cache, making it an anonymous buffer. This
- * must be done after a read and prior to modifying the buffer contents.
- * If the buffer has more than one reference, we must make
- * a new hdr for the buffer.
- */
-void
-arc_release(arc_buf_t *buf, void *tag)
-{
- arc_buf_hdr_t *hdr = buf->b_hdr;
-
- /*
- * It would be nice to assert that if it's DMU metadata (level >
- * 0 || it's the dnode file), then it must be syncing context.
- * But we don't know that information at this level.
- */
-
- mutex_enter(&buf->b_evict_lock);
-
- ASSERT(HDR_HAS_L1HDR(hdr));
-
- /*
- * We don't grab the hash lock prior to this check, because if
- * the buffer's header is in the arc_anon state, it won't be
- * linked into the hash table.
- */
- if (hdr->b_l1hdr.b_state == arc_anon) {
- mutex_exit(&buf->b_evict_lock);
- ASSERT(!HDR_IO_IN_PROGRESS(hdr));
- ASSERT(!HDR_IN_HASH_TABLE(hdr));
- ASSERT(!HDR_HAS_L2HDR(hdr));
- ASSERT(HDR_EMPTY(hdr));
- ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
- ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1);
- ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
-
- hdr->b_l1hdr.b_arc_access = 0;
-
- /*
- * If the buf is being overridden then it may already
- * have a hdr that is not empty.
- */
- buf_discard_identity(hdr);
- arc_buf_thaw(buf);
-
- return;
- }
-
- kmutex_t *hash_lock = HDR_LOCK(hdr);
- mutex_enter(hash_lock);
-
- /*
- * This assignment is only valid as long as the hash_lock is
-	 * held; we must be careful not to reference state or the
- * b_state field after dropping the lock.
- */
- arc_state_t *state = hdr->b_l1hdr.b_state;
- ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
- ASSERT3P(state, !=, arc_anon);
-
- /* this buffer is not on any list */
- ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), >, 0);
-
- if (HDR_HAS_L2HDR(hdr)) {
- mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx);
-
- /*
- * We have to recheck this conditional again now that
- * we're holding the l2ad_mtx to prevent a race with
- * another thread which might be concurrently calling
- * l2arc_evict(). In that case, l2arc_evict() might have
- * destroyed the header's L2 portion as we were waiting
- * to acquire the l2ad_mtx.
- */
- if (HDR_HAS_L2HDR(hdr)) {
- l2arc_trim(hdr);
- arc_hdr_l2hdr_destroy(hdr);
- }
-
- mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx);
- }
-
- /*
- * Do we have more than one buf?
- */
- if (hdr->b_l1hdr.b_bufcnt > 1) {
- arc_buf_hdr_t *nhdr;
- uint64_t spa = hdr->b_spa;
- uint64_t psize = HDR_GET_PSIZE(hdr);
- uint64_t lsize = HDR_GET_LSIZE(hdr);
- enum zio_compress compress = HDR_GET_COMPRESS(hdr);
- arc_buf_contents_t type = arc_buf_type(hdr);
- VERIFY3U(hdr->b_type, ==, type);
-
- ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL);
- (void) remove_reference(hdr, hash_lock, tag);
-
- if (arc_buf_is_shared(buf) && !ARC_BUF_COMPRESSED(buf)) {
- ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf);
- ASSERT(ARC_BUF_LAST(buf));
- }
-
- /*
- * Pull the data off of this hdr and attach it to
- * a new anonymous hdr. Also find the last buffer
- * in the hdr's buffer list.
- */
- arc_buf_t *lastbuf = arc_buf_remove(hdr, buf);
- ASSERT3P(lastbuf, !=, NULL);
-
- /*
- * If the current arc_buf_t and the hdr are sharing their data
- * buffer, then we must stop sharing that block.
- */
- if (arc_buf_is_shared(buf)) {
- VERIFY(!arc_buf_is_shared(lastbuf));
-
- /*
- * First, sever the block sharing relationship between
- * buf and the arc_buf_hdr_t.
- */
- arc_unshare_buf(hdr, buf);
-
- /*
- * Now we need to recreate the hdr's b_pabd. Since we
- * have lastbuf handy, we try to share with it, but if
- * we can't then we allocate a new b_pabd and copy the
- * data from buf into it.
- */
- if (arc_can_share(hdr, lastbuf)) {
- arc_share_buf(hdr, lastbuf);
- } else {
- arc_hdr_alloc_pabd(hdr, B_TRUE);
- abd_copy_from_buf(hdr->b_l1hdr.b_pabd,
- buf->b_data, psize);
- }
- VERIFY3P(lastbuf->b_data, !=, NULL);
- } else if (HDR_SHARED_DATA(hdr)) {
- /*
- * Uncompressed shared buffers are always at the end
- * of the list. Compressed buffers don't have the
- * same requirements. This makes it hard to
- * simply assert that the lastbuf is shared so
- * we rely on the hdr's compression flags to determine
- * if we have a compressed, shared buffer.
- */
- ASSERT(arc_buf_is_shared(lastbuf) ||
- HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF);
- ASSERT(!ARC_BUF_SHARED(buf));
- }
- ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
- ASSERT3P(state, !=, arc_l2c_only);
-
- (void) zfs_refcount_remove_many(&state->arcs_size,
- arc_buf_size(buf), buf);
-
- if (zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) {
- ASSERT3P(state, !=, arc_l2c_only);
- (void) zfs_refcount_remove_many(
- &state->arcs_esize[type],
- arc_buf_size(buf), buf);
- }
-
- hdr->b_l1hdr.b_bufcnt -= 1;
- arc_cksum_verify(buf);
-#ifdef illumos
- arc_buf_unwatch(buf);
-#endif
-
- mutex_exit(hash_lock);
-
- /*
- * Allocate a new hdr. The new hdr will contain a b_pabd
- * buffer which will be freed in arc_write().
- */
- nhdr = arc_hdr_alloc(spa, psize, lsize, compress, type);
- ASSERT3P(nhdr->b_l1hdr.b_buf, ==, NULL);
- ASSERT0(nhdr->b_l1hdr.b_bufcnt);
- ASSERT0(zfs_refcount_count(&nhdr->b_l1hdr.b_refcnt));
- VERIFY3U(nhdr->b_type, ==, type);
- ASSERT(!HDR_SHARED_DATA(nhdr));
-
- nhdr->b_l1hdr.b_buf = buf;
- nhdr->b_l1hdr.b_bufcnt = 1;
- (void) zfs_refcount_add(&nhdr->b_l1hdr.b_refcnt, tag);
- buf->b_hdr = nhdr;
-
- mutex_exit(&buf->b_evict_lock);
- (void) zfs_refcount_add_many(&arc_anon->arcs_size,
- arc_buf_size(buf), buf);
- } else {
- mutex_exit(&buf->b_evict_lock);
- ASSERT(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 1);
- /* protected by hash lock, or hdr is on arc_anon */
- ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
- ASSERT(!HDR_IO_IN_PROGRESS(hdr));
- arc_change_state(arc_anon, hdr, hash_lock);
- hdr->b_l1hdr.b_arc_access = 0;
- mutex_exit(hash_lock);
-
- buf_discard_identity(hdr);
- arc_buf_thaw(buf);
- }
-}
-
-int
-arc_released(arc_buf_t *buf)
-{
- int released;
-
- mutex_enter(&buf->b_evict_lock);
- released = (buf->b_data != NULL &&
- buf->b_hdr->b_l1hdr.b_state == arc_anon);
- mutex_exit(&buf->b_evict_lock);
- return (released);
-}
-
-#ifdef ZFS_DEBUG
-int
-arc_referenced(arc_buf_t *buf)
-{
- int referenced;
-
- mutex_enter(&buf->b_evict_lock);
- referenced = (zfs_refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt));
- mutex_exit(&buf->b_evict_lock);
- return (referenced);
-}
-#endif
-
-static void
-arc_write_ready(zio_t *zio)
-{
- arc_write_callback_t *callback = zio->io_private;
- arc_buf_t *buf = callback->awcb_buf;
- arc_buf_hdr_t *hdr = buf->b_hdr;
- uint64_t psize = BP_IS_HOLE(zio->io_bp) ? 0 : BP_GET_PSIZE(zio->io_bp);
-
- ASSERT(HDR_HAS_L1HDR(hdr));
- ASSERT(!zfs_refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt));
- ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
-
- /*
- * If we're reexecuting this zio because the pool suspended, then
- * cleanup any state that was previously set the first time the
- * callback was invoked.
- */
- if (zio->io_flags & ZIO_FLAG_REEXECUTED) {
- arc_cksum_free(hdr);
-#ifdef illumos
- arc_buf_unwatch(buf);
-#endif
- if (hdr->b_l1hdr.b_pabd != NULL) {
- if (arc_buf_is_shared(buf)) {
- arc_unshare_buf(hdr, buf);
- } else {
- arc_hdr_free_pabd(hdr);
- }
- }
- }
- ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
- ASSERT(!HDR_SHARED_DATA(hdr));
- ASSERT(!arc_buf_is_shared(buf));
-
- callback->awcb_ready(zio, buf, callback->awcb_private);
-
- if (HDR_IO_IN_PROGRESS(hdr))
- ASSERT(zio->io_flags & ZIO_FLAG_REEXECUTED);
-
- arc_cksum_compute(buf);
- arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
-
- enum zio_compress compress;
- if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) {
- compress = ZIO_COMPRESS_OFF;
- } else {
- ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(zio->io_bp));
- compress = BP_GET_COMPRESS(zio->io_bp);
- }
- HDR_SET_PSIZE(hdr, psize);
- arc_hdr_set_compress(hdr, compress);
-
- /*
- * Fill the hdr with data. If the hdr is compressed, the data we want
- * is available from the zio, otherwise we can take it from the buf.
- *
- * We might be able to share the buf's data with the hdr here. However,
- * doing so would cause the ARC to be full of linear ABDs if we write a
- * lot of shareable data. As a compromise, we check whether scattered
- * ABDs are allowed, and assume that if they are then the user wants
- * the ARC to be primarily filled with them regardless of the data being
- * written. Therefore, if they're allowed then we allocate one and copy
- * the data into it; otherwise, we share the data directly if we can.
- */
- if (zfs_abd_scatter_enabled || !arc_can_share(hdr, buf)) {
- arc_hdr_alloc_pabd(hdr, B_TRUE);
-
- /*
- * Ideally, we would always copy the io_abd into b_pabd, but the
- * user may have disabled compressed ARC, thus we must check the
- * hdr's compression setting rather than the io_bp's.
- */
- if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) {
- ASSERT3U(BP_GET_COMPRESS(zio->io_bp), !=,
- ZIO_COMPRESS_OFF);
- ASSERT3U(psize, >, 0);
-
- abd_copy(hdr->b_l1hdr.b_pabd, zio->io_abd, psize);
- } else {
- ASSERT3U(zio->io_orig_size, ==, arc_hdr_size(hdr));
-
- abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data,
- arc_buf_size(buf));
- }
- } else {
- ASSERT3P(buf->b_data, ==, abd_to_buf(zio->io_orig_abd));
- ASSERT3U(zio->io_orig_size, ==, arc_buf_size(buf));
- ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
-
- arc_share_buf(hdr, buf);
- }
-
- arc_hdr_verify(hdr, zio->io_bp);
-}
-
-static void
-arc_write_children_ready(zio_t *zio)
-{
- arc_write_callback_t *callback = zio->io_private;
- arc_buf_t *buf = callback->awcb_buf;
-
- callback->awcb_children_ready(zio, buf, callback->awcb_private);
-}
-
-/*
- * The SPA calls this callback for each physical write that happens on behalf
- * of a logical write. See the comment in dbuf_write_physdone() for details.
- */
-static void
-arc_write_physdone(zio_t *zio)
-{
- arc_write_callback_t *cb = zio->io_private;
- if (cb->awcb_physdone != NULL)
- cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private);
-}
-
-static void
-arc_write_done(zio_t *zio)
-{
- arc_write_callback_t *callback = zio->io_private;
- arc_buf_t *buf = callback->awcb_buf;
- arc_buf_hdr_t *hdr = buf->b_hdr;
-
- ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
-
- if (zio->io_error == 0) {
- arc_hdr_verify(hdr, zio->io_bp);
-
- if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) {
- buf_discard_identity(hdr);
- } else {
- hdr->b_dva = *BP_IDENTITY(zio->io_bp);
- hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
- }
- } else {
- ASSERT(HDR_EMPTY(hdr));
- }
-
- /*
- * If the block to be written was all-zero or compressed enough to be
- * embedded in the BP, no write was performed so there will be no
- * dva/birth/checksum. The buffer must therefore remain anonymous
- * (and uncached).
- */
- if (!HDR_EMPTY(hdr)) {
- arc_buf_hdr_t *exists;
- kmutex_t *hash_lock;
-
- ASSERT3U(zio->io_error, ==, 0);
-
- arc_cksum_verify(buf);
-
- exists = buf_hash_insert(hdr, &hash_lock);
- if (exists != NULL) {
- /*
- * This can only happen if we overwrite for
- * sync-to-convergence, because we remove
- * buffers from the hash table when we arc_free().
- */
- if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
- if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
- panic("bad overwrite, hdr=%p exists=%p",
- (void *)hdr, (void *)exists);
- ASSERT(zfs_refcount_is_zero(
- &exists->b_l1hdr.b_refcnt));
- arc_change_state(arc_anon, exists, hash_lock);
- mutex_exit(hash_lock);
- arc_hdr_destroy(exists);
- exists = buf_hash_insert(hdr, &hash_lock);
- ASSERT3P(exists, ==, NULL);
- } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
- /* nopwrite */
- ASSERT(zio->io_prop.zp_nopwrite);
- if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
- panic("bad nopwrite, hdr=%p exists=%p",
- (void *)hdr, (void *)exists);
- } else {
- /* Dedup */
- ASSERT(hdr->b_l1hdr.b_bufcnt == 1);
- ASSERT(hdr->b_l1hdr.b_state == arc_anon);
- ASSERT(BP_GET_DEDUP(zio->io_bp));
- ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
- }
- }
- arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
- /* if it's not anon, we are doing a scrub */
- if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon)
- arc_access(hdr, hash_lock);
- mutex_exit(hash_lock);
- } else {
- arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
- }
-
- ASSERT(!zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
- callback->awcb_done(zio, buf, callback->awcb_private);
-
- abd_put(zio->io_abd);
- kmem_free(callback, sizeof (arc_write_callback_t));
-}
-
-zio_t *
-arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
- boolean_t l2arc, const zio_prop_t *zp, arc_write_done_func_t *ready,
- arc_write_done_func_t *children_ready, arc_write_done_func_t *physdone,
- arc_write_done_func_t *done, void *private, zio_priority_t priority,
- int zio_flags, const zbookmark_phys_t *zb)
-{
- arc_buf_hdr_t *hdr = buf->b_hdr;
- arc_write_callback_t *callback;
- zio_t *zio;
- zio_prop_t localprop = *zp;
-
- ASSERT3P(ready, !=, NULL);
- ASSERT3P(done, !=, NULL);
- ASSERT(!HDR_IO_ERROR(hdr));
- ASSERT(!HDR_IO_IN_PROGRESS(hdr));
- ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
- ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0);
- if (l2arc)
- arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
- if (ARC_BUF_COMPRESSED(buf)) {
- /*
- * We're writing a pre-compressed buffer. Make the
- * compression algorithm requested by the zio_prop_t match
- * the pre-compressed buffer's compression algorithm.
- */
- localprop.zp_compress = HDR_GET_COMPRESS(hdr);
-
- ASSERT3U(HDR_GET_LSIZE(hdr), !=, arc_buf_size(buf));
- zio_flags |= ZIO_FLAG_RAW;
- }
- callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
- callback->awcb_ready = ready;
- callback->awcb_children_ready = children_ready;
- callback->awcb_physdone = physdone;
- callback->awcb_done = done;
- callback->awcb_private = private;
- callback->awcb_buf = buf;
-
- /*
- * The hdr's b_pabd is now stale, free it now. A new data block
- * will be allocated when the zio pipeline calls arc_write_ready().
- */
- if (hdr->b_l1hdr.b_pabd != NULL) {
- /*
- * If the buf is currently sharing the data block with
- * the hdr then we need to break that relationship here.
- * The hdr will remain with a NULL data pointer and the
- * buf will take sole ownership of the block.
- */
- if (arc_buf_is_shared(buf)) {
- arc_unshare_buf(hdr, buf);
- } else {
- arc_hdr_free_pabd(hdr);
- }
- VERIFY3P(buf->b_data, !=, NULL);
- arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF);
- }
- ASSERT(!arc_buf_is_shared(buf));
- ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
-
- zio = zio_write(pio, spa, txg, bp,
- abd_get_from_buf(buf->b_data, HDR_GET_LSIZE(hdr)),
- HDR_GET_LSIZE(hdr), arc_buf_size(buf), &localprop, arc_write_ready,
- (children_ready != NULL) ? arc_write_children_ready : NULL,
- arc_write_physdone, arc_write_done, callback,
- priority, zio_flags, zb);
-
- return (zio);
-}
-
-static int
-arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg)
-{
-#ifdef _KERNEL
- uint64_t available_memory = ptob(freemem);
-
-#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC)
- available_memory = MIN(available_memory, uma_avail());
-#endif
-
- if (freemem > (uint64_t)physmem * arc_lotsfree_percent / 100)
- return (0);
-
- if (txg > spa->spa_lowmem_last_txg) {
- spa->spa_lowmem_last_txg = txg;
- spa->spa_lowmem_page_load = 0;
- }
- /*
-	 * If we are in pageout, we know that memory is already tight
-	 * and the arc is already going to be evicting, so we just want
-	 * to continue to let page writes occur as quickly as possible.
- */
- if (curproc == pageproc) {
- if (spa->spa_lowmem_page_load >
- MAX(ptob(minfree), available_memory) / 4)
- return (SET_ERROR(ERESTART));
- /* Note: reserve is inflated, so we deflate */
- atomic_add_64(&spa->spa_lowmem_page_load, reserve / 8);
- return (0);
- } else if (spa->spa_lowmem_page_load > 0 && arc_reclaim_needed()) {
- /* memory is low, delay before restarting */
- ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
- return (SET_ERROR(EAGAIN));
- }
- spa->spa_lowmem_page_load = 0;
-#endif /* _KERNEL */
- return (0);
-}
-
-void
-arc_tempreserve_clear(uint64_t reserve)
-{
- atomic_add_64(&arc_tempreserve, -reserve);
- ASSERT((int64_t)arc_tempreserve >= 0);
-}
-
-int
-arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg)
-{
- int error;
- uint64_t anon_size;
-
- if (reserve > arc_c/4 && !arc_no_grow) {
- arc_c = MIN(arc_c_max, reserve * 4);
- DTRACE_PROBE1(arc__set_reserve, uint64_t, arc_c);
- }
- if (reserve > arc_c)
- return (SET_ERROR(ENOMEM));
-
- /*
- * Don't count loaned bufs as in flight dirty data to prevent long
- * network delays from blocking transactions that are ready to be
- * assigned to a txg.
- */
-
- /* assert that it has not wrapped around */
- ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0);
-
- anon_size = MAX((int64_t)(zfs_refcount_count(&arc_anon->arcs_size) -
- arc_loaned_bytes), 0);
-
- /*
- * Writes will, almost always, require additional memory allocations
- * in order to compress/encrypt/etc the data. We therefore need to
- * make sure that there is sufficient available memory for this.
- */
- error = arc_memory_throttle(spa, reserve, txg);
- if (error != 0)
- return (error);
-
- /*
- * Throttle writes when the amount of dirty data in the cache
- * gets too large. We try to keep the cache less than half full
- * of dirty blocks so that our sync times don't grow too large.
- *
- * In the case of one pool being built on another pool, we want
- * to make sure we don't end up throttling the lower (backing)
- * pool when the upper pool is the majority contributor to dirty
-	 * data. To ensure we make forward progress during throttling, we
- * also check the current pool's net dirty data and only throttle
- * if it exceeds zfs_arc_pool_dirty_percent of the anonymous dirty
- * data in the cache.
- *
- * Note: if two requests come in concurrently, we might let them
- * both succeed, when one of them should fail. Not a huge deal.
- */
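-	/*
-	 * Illustrative numbers (assuming the default tunables of
-	 * zfs_arc_dirty_limit_percent = 50, zfs_arc_anon_limit_percent
-	 * = 25 and zfs_arc_pool_dirty_percent = 20): with arc_c = 8 GB,
-	 * a write throttles only once total dirty data exceeds 4 GB,
-	 * anonymous data exceeds 2 GB, and this pool accounts for at
-	 * least 20% of that anonymous data.
-	 */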
- uint64_t total_dirty = reserve + arc_tempreserve + anon_size;
- uint64_t spa_dirty_anon = spa_dirty_data(spa);
-
- if (total_dirty > arc_c * zfs_arc_dirty_limit_percent / 100 &&
- anon_size > arc_c * zfs_arc_anon_limit_percent / 100 &&
- spa_dirty_anon > anon_size * zfs_arc_pool_dirty_percent / 100) {
- uint64_t meta_esize =
- zfs_refcount_count(
- &arc_anon->arcs_esize[ARC_BUFC_METADATA]);
- uint64_t data_esize =
- zfs_refcount_count(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
- dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
- "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
- arc_tempreserve >> 10, meta_esize >> 10,
- data_esize >> 10, reserve >> 10, arc_c >> 10);
- return (SET_ERROR(ERESTART));
- }
- atomic_add_64(&arc_tempreserve, reserve);
- return (0);
-}
-
-static void
-arc_kstat_update_state(arc_state_t *state, kstat_named_t *size,
- kstat_named_t *evict_data, kstat_named_t *evict_metadata)
-{
- size->value.ui64 = zfs_refcount_count(&state->arcs_size);
- evict_data->value.ui64 =
- zfs_refcount_count(&state->arcs_esize[ARC_BUFC_DATA]);
- evict_metadata->value.ui64 =
- zfs_refcount_count(&state->arcs_esize[ARC_BUFC_METADATA]);
-}
-
-static int
-arc_kstat_update(kstat_t *ksp, int rw)
-{
- arc_stats_t *as = ksp->ks_data;
-
- if (rw == KSTAT_WRITE) {
- return (EACCES);
- } else {
- arc_kstat_update_state(arc_anon,
- &as->arcstat_anon_size,
- &as->arcstat_anon_evictable_data,
- &as->arcstat_anon_evictable_metadata);
- arc_kstat_update_state(arc_mru,
- &as->arcstat_mru_size,
- &as->arcstat_mru_evictable_data,
- &as->arcstat_mru_evictable_metadata);
- arc_kstat_update_state(arc_mru_ghost,
- &as->arcstat_mru_ghost_size,
- &as->arcstat_mru_ghost_evictable_data,
- &as->arcstat_mru_ghost_evictable_metadata);
- arc_kstat_update_state(arc_mfu,
- &as->arcstat_mfu_size,
- &as->arcstat_mfu_evictable_data,
- &as->arcstat_mfu_evictable_metadata);
- arc_kstat_update_state(arc_mfu_ghost,
- &as->arcstat_mfu_ghost_size,
- &as->arcstat_mfu_ghost_evictable_data,
- &as->arcstat_mfu_ghost_evictable_metadata);
-
- ARCSTAT(arcstat_size) = aggsum_value(&arc_size);
- ARCSTAT(arcstat_meta_used) = aggsum_value(&arc_meta_used);
- ARCSTAT(arcstat_data_size) = aggsum_value(&astat_data_size);
- ARCSTAT(arcstat_metadata_size) =
- aggsum_value(&astat_metadata_size);
- ARCSTAT(arcstat_hdr_size) = aggsum_value(&astat_hdr_size);
- ARCSTAT(arcstat_bonus_size) = aggsum_value(&astat_bonus_size);
- ARCSTAT(arcstat_dnode_size) = aggsum_value(&astat_dnode_size);
- ARCSTAT(arcstat_dbuf_size) = aggsum_value(&astat_dbuf_size);
-#if defined(__FreeBSD__) && defined(COMPAT_FREEBSD11)
- ARCSTAT(arcstat_other_size) = aggsum_value(&astat_bonus_size) +
- aggsum_value(&astat_dnode_size) +
- aggsum_value(&astat_dbuf_size);
-#endif
- ARCSTAT(arcstat_l2_hdr_size) = aggsum_value(&astat_l2_hdr_size);
- }
-
- return (0);
-}
-
-/*
- * This function *must* return indices evenly distributed between all
- * sublists of the multilist. This is needed due to how the ARC eviction
- * code is laid out; arc_evict_state() assumes ARC buffers are evenly
- * distributed between all sublists and uses this assumption when
- * deciding which sublist to evict from and how much to evict from it.
- */
-unsigned int
-arc_state_multilist_index_func(multilist_t *ml, void *obj)
-{
- arc_buf_hdr_t *hdr = obj;
-
- /*
- * We rely on b_dva to generate evenly distributed index
- * numbers using buf_hash below. So, as an added precaution,
- * let's make sure we never add empty buffers to the arc lists.
- */
- ASSERT(!HDR_EMPTY(hdr));
-
- /*
-	 * The assumption here is that the hash value for a given
-	 * arc_buf_hdr_t will remain constant throughout its lifetime
-	 * (i.e. its b_spa, b_dva, and b_birth fields don't change).
- * Thus, we don't need to store the header's sublist index
- * on insertion, as this index can be recalculated on removal.
- *
- * Also, the low order bits of the hash value are thought to be
- * distributed evenly. Otherwise, in the case that the multilist
-	 * has a power-of-two number of sublists, each sublist's usage
- * would not be evenly distributed.
- */
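-	/*
-	 * For example, with 8 sublists a given header always maps to
-	 * buf_hash(b_spa, &b_dva, b_birth) % 8, both on insertion and
-	 * when the index is recalculated on removal.
-	 */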
- return (buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) %
- multilist_get_num_sublists(ml));
-}
-
-#ifdef _KERNEL
-static eventhandler_tag arc_event_lowmem = NULL;
-
-static void
-arc_lowmem(void *arg __unused, int howto __unused)
-{
- int64_t free_memory, to_free;
-
- arc_no_grow = B_TRUE;
- arc_warm = B_TRUE;
- arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry);
- free_memory = arc_available_memory();
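-	/*
-	 * arc_available_memory() returns a negative value when memory
-	 * is scarce, so subtracting MIN(free_memory, 0) adds that
-	 * deficit on top of the arc_shrink_shift reduction.
-	 */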
- to_free = (arc_c >> arc_shrink_shift) - MIN(free_memory, 0);
- DTRACE_PROBE2(arc__needfree, int64_t, free_memory, int64_t, to_free);
- arc_reduce_target_size(to_free);
-
- mutex_enter(&arc_adjust_lock);
- arc_adjust_needed = B_TRUE;
- zthr_wakeup(arc_adjust_zthr);
-
- /*
- * It is unsafe to block here in arbitrary threads, because we can come
- * here from ARC itself and may hold ARC locks and thus risk a deadlock
-	 * with the ARC reclaim thread.
- */
- if (curproc == pageproc)
- (void) cv_wait(&arc_adjust_waiters_cv, &arc_adjust_lock);
- mutex_exit(&arc_adjust_lock);
-}
-#endif
-
-static void
-arc_state_init(void)
-{
- arc_anon = &ARC_anon;
- arc_mru = &ARC_mru;
- arc_mru_ghost = &ARC_mru_ghost;
- arc_mfu = &ARC_mfu;
- arc_mfu_ghost = &ARC_mfu_ghost;
- arc_l2c_only = &ARC_l2c_only;
-
- arc_mru->arcs_list[ARC_BUFC_METADATA] =
- multilist_create(sizeof (arc_buf_hdr_t),
- offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
- arc_state_multilist_index_func);
- arc_mru->arcs_list[ARC_BUFC_DATA] =
- multilist_create(sizeof (arc_buf_hdr_t),
- offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
- arc_state_multilist_index_func);
- arc_mru_ghost->arcs_list[ARC_BUFC_METADATA] =
- multilist_create(sizeof (arc_buf_hdr_t),
- offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
- arc_state_multilist_index_func);
- arc_mru_ghost->arcs_list[ARC_BUFC_DATA] =
- multilist_create(sizeof (arc_buf_hdr_t),
- offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
- arc_state_multilist_index_func);
- arc_mfu->arcs_list[ARC_BUFC_METADATA] =
- multilist_create(sizeof (arc_buf_hdr_t),
- offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
- arc_state_multilist_index_func);
- arc_mfu->arcs_list[ARC_BUFC_DATA] =
- multilist_create(sizeof (arc_buf_hdr_t),
- offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
- arc_state_multilist_index_func);
- arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA] =
- multilist_create(sizeof (arc_buf_hdr_t),
- offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
- arc_state_multilist_index_func);
- arc_mfu_ghost->arcs_list[ARC_BUFC_DATA] =
- multilist_create(sizeof (arc_buf_hdr_t),
- offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
- arc_state_multilist_index_func);
- arc_l2c_only->arcs_list[ARC_BUFC_METADATA] =
- multilist_create(sizeof (arc_buf_hdr_t),
- offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
- arc_state_multilist_index_func);
- arc_l2c_only->arcs_list[ARC_BUFC_DATA] =
- multilist_create(sizeof (arc_buf_hdr_t),
- offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
- arc_state_multilist_index_func);
-
- zfs_refcount_create(&arc_anon->arcs_esize[ARC_BUFC_METADATA]);
- zfs_refcount_create(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
- zfs_refcount_create(&arc_mru->arcs_esize[ARC_BUFC_METADATA]);
- zfs_refcount_create(&arc_mru->arcs_esize[ARC_BUFC_DATA]);
- zfs_refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]);
- zfs_refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]);
- zfs_refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]);
- zfs_refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_DATA]);
- zfs_refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]);
- zfs_refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]);
- zfs_refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]);
- zfs_refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]);
-
- zfs_refcount_create(&arc_anon->arcs_size);
- zfs_refcount_create(&arc_mru->arcs_size);
- zfs_refcount_create(&arc_mru_ghost->arcs_size);
- zfs_refcount_create(&arc_mfu->arcs_size);
- zfs_refcount_create(&arc_mfu_ghost->arcs_size);
- zfs_refcount_create(&arc_l2c_only->arcs_size);
-
- aggsum_init(&arc_meta_used, 0);
- aggsum_init(&arc_size, 0);
- aggsum_init(&astat_data_size, 0);
- aggsum_init(&astat_metadata_size, 0);
- aggsum_init(&astat_hdr_size, 0);
- aggsum_init(&astat_bonus_size, 0);
- aggsum_init(&astat_dnode_size, 0);
- aggsum_init(&astat_dbuf_size, 0);
- aggsum_init(&astat_l2_hdr_size, 0);
-}
-
-static void
-arc_state_fini(void)
-{
- zfs_refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_METADATA]);
- zfs_refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
- zfs_refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_METADATA]);
- zfs_refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_DATA]);
- zfs_refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]);
- zfs_refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]);
- zfs_refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]);
- zfs_refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_DATA]);
- zfs_refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]);
- zfs_refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]);
- zfs_refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]);
- zfs_refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]);
-
- zfs_refcount_destroy(&arc_anon->arcs_size);
- zfs_refcount_destroy(&arc_mru->arcs_size);
- zfs_refcount_destroy(&arc_mru_ghost->arcs_size);
- zfs_refcount_destroy(&arc_mfu->arcs_size);
- zfs_refcount_destroy(&arc_mfu_ghost->arcs_size);
- zfs_refcount_destroy(&arc_l2c_only->arcs_size);
-
- multilist_destroy(arc_mru->arcs_list[ARC_BUFC_METADATA]);
- multilist_destroy(arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
- multilist_destroy(arc_mfu->arcs_list[ARC_BUFC_METADATA]);
- multilist_destroy(arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
- multilist_destroy(arc_mru->arcs_list[ARC_BUFC_DATA]);
- multilist_destroy(arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
- multilist_destroy(arc_mfu->arcs_list[ARC_BUFC_DATA]);
- multilist_destroy(arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
-
- aggsum_fini(&arc_meta_used);
- aggsum_fini(&arc_size);
- aggsum_fini(&astat_data_size);
- aggsum_fini(&astat_metadata_size);
- aggsum_fini(&astat_hdr_size);
- aggsum_fini(&astat_bonus_size);
- aggsum_fini(&astat_dnode_size);
- aggsum_fini(&astat_dbuf_size);
- aggsum_fini(&astat_l2_hdr_size);
-}
-
-uint64_t
-arc_max_bytes(void)
-{
- return (arc_c_max);
-}
-
-void
-arc_init(void)
-{
- int prefetch_tunable_set = 0;
-
- /*
- * allmem is "all memory that we could possibly use".
- */
-#ifdef illumos
-#ifdef _KERNEL
- uint64_t allmem = ptob(physmem - swapfs_minfree);
-#else
- uint64_t allmem = (physmem * PAGESIZE) / 2;
-#endif
-#else
- uint64_t allmem = kmem_size();
-#endif
- mutex_init(&arc_adjust_lock, NULL, MUTEX_DEFAULT, NULL);
- cv_init(&arc_adjust_waiters_cv, NULL, CV_DEFAULT, NULL);
-
- mutex_init(&arc_dnlc_evicts_lock, NULL, MUTEX_DEFAULT, NULL);
- cv_init(&arc_dnlc_evicts_cv, NULL, CV_DEFAULT, NULL);
-
- /* set min cache to 1/32 of all memory, or arc_abs_min, whichever is more */
- arc_c_min = MAX(allmem / 32, arc_abs_min);
- /* set max to 5/8 of all memory, or all but 1GB, whichever is more */
- if (allmem >= 1 << 30)
- arc_c_max = allmem - (1 << 30);
- else
- arc_c_max = arc_c_min;
- arc_c_max = MAX(allmem * 5 / 8, arc_c_max);
-
- /*
- * In userland, there's only the memory pressure that we artificially
- * create (see arc_available_memory()). Don't let arc_c get too
- * small, because it can cause transactions to be larger than
- * arc_c, causing arc_tempreserve_space() to fail.
- */
-#ifndef _KERNEL
- arc_c_min = arc_c_max / 2;
-#endif
-
-#ifdef _KERNEL
- /*
- * Allow the tunables to override our calculations if they are
- * reasonable.
- */
- if (zfs_arc_max > arc_abs_min && zfs_arc_max < allmem) {
- arc_c_max = zfs_arc_max;
- arc_c_min = MIN(arc_c_min, arc_c_max);
- }
- if (zfs_arc_min > arc_abs_min && zfs_arc_min <= arc_c_max)
- arc_c_min = zfs_arc_min;
-#endif
-
- arc_c = arc_c_max;
- arc_p = (arc_c >> 1);
-
- /* limit meta-data to 1/4 of the arc capacity */
- arc_meta_limit = arc_c_max / 4;
-
-#ifdef _KERNEL
- /*
- * Metadata is stored in the kernel's heap. Don't let us
- * use more than half the heap for the ARC.
- */
-#ifdef __FreeBSD__
- arc_meta_limit = MIN(arc_meta_limit, uma_limit() / 2);
- arc_dnode_limit = arc_meta_limit / 10;
-#else
- arc_meta_limit = MIN(arc_meta_limit,
- vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 2);
-#endif
-#endif
-
- /* Allow the tunable to override if it is reasonable */
- if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
- arc_meta_limit = zfs_arc_meta_limit;
-
- if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
- arc_c_min = arc_meta_limit / 2;
-
- if (zfs_arc_meta_min > 0) {
- arc_meta_min = zfs_arc_meta_min;
- } else {
- arc_meta_min = arc_c_min / 2;
- }
-
- /* Valid range: <arc_meta_min> - <arc_c_max> */
- if ((zfs_arc_dnode_limit) && (zfs_arc_dnode_limit != arc_dnode_limit) &&
- (zfs_arc_dnode_limit >= zfs_arc_meta_min) &&
- (zfs_arc_dnode_limit <= arc_c_max))
- arc_dnode_limit = zfs_arc_dnode_limit;
-
- if (zfs_arc_grow_retry > 0)
- arc_grow_retry = zfs_arc_grow_retry;
-
- if (zfs_arc_shrink_shift > 0)
- arc_shrink_shift = zfs_arc_shrink_shift;
-
- if (zfs_arc_no_grow_shift > 0)
- arc_no_grow_shift = zfs_arc_no_grow_shift;
- /*
- * Ensure that arc_no_grow_shift is less than arc_shrink_shift.
- */
- if (arc_no_grow_shift >= arc_shrink_shift)
- arc_no_grow_shift = arc_shrink_shift - 1;
-
- if (zfs_arc_p_min_shift > 0)
- arc_p_min_shift = zfs_arc_p_min_shift;
-
- /* if kmem_flags are set, let's try to use less memory */
- if (kmem_debugging())
- arc_c = arc_c / 2;
- if (arc_c < arc_c_min)
- arc_c = arc_c_min;
-
- zfs_arc_min = arc_c_min;
- zfs_arc_max = arc_c_max;
-
- arc_state_init();
-
- /*
- * The arc must be "uninitialized", so that hdr_recl() (which is
- * registered by buf_init()) will not access arc_reap_zthr before
- * it is created.
- */
- ASSERT(!arc_initialized);
- buf_init();
-
- list_create(&arc_prune_list, sizeof (arc_prune_t),
- offsetof(arc_prune_t, p_node));
- mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL);
-
- arc_prune_taskq = taskq_create("arc_prune", max_ncpus, minclsyspri,
- max_ncpus, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
-
- arc_dnlc_evicts_thread_exit = FALSE;
-
- arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
- sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
-
- if (arc_ksp != NULL) {
- arc_ksp->ks_data = &arc_stats;
- arc_ksp->ks_update = arc_kstat_update;
- kstat_install(arc_ksp);
- }
-
- arc_adjust_zthr = zthr_create_timer(arc_adjust_cb_check,
- arc_adjust_cb, NULL, SEC2NSEC(1));
- arc_reap_zthr = zthr_create_timer(arc_reap_cb_check,
- arc_reap_cb, NULL, SEC2NSEC(1));
-
-#ifdef _KERNEL
- arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL,
- EVENTHANDLER_PRI_FIRST);
-#endif
-
- (void) thread_create(NULL, 0, arc_dnlc_evicts_thread, NULL, 0, &p0,
- TS_RUN, minclsyspri);
-
- arc_initialized = B_TRUE;
- arc_warm = B_FALSE;
-
- /*
- * Calculate maximum amount of dirty data per pool.
- *
- * If it has been set by /etc/system, take that.
- * Otherwise, use a percentage of physical memory defined by
- * zfs_dirty_data_max_percent (default 10%) with a cap at
- * zfs_dirty_data_max_max (default 4GB).
- */
- if (zfs_dirty_data_max == 0) {
- zfs_dirty_data_max = ptob(physmem) *
- zfs_dirty_data_max_percent / 100;
- zfs_dirty_data_max = MIN(zfs_dirty_data_max,
- zfs_dirty_data_max_max);
- }
-
-#ifdef _KERNEL
- if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable))
- prefetch_tunable_set = 1;
-
-#ifdef __i386__
- if (prefetch_tunable_set == 0) {
- printf("ZFS NOTICE: Prefetch is disabled by default on i386 "
- "-- to enable,\n");
- printf(" add \"vfs.zfs.prefetch_disable=0\" "
- "to /boot/loader.conf.\n");
- zfs_prefetch_disable = 1;
- }
-#else
- if ((((uint64_t)physmem * PAGESIZE) < (1ULL << 32)) &&
- prefetch_tunable_set == 0) {
- printf("ZFS NOTICE: Prefetch is disabled by default if less "
- "than 4GB of RAM is present;\n"
- " to enable, add \"vfs.zfs.prefetch_disable=0\" "
- "to /boot/loader.conf.\n");
- zfs_prefetch_disable = 1;
- }
-#endif
- /* Warn about ZFS memory and address space requirements. */
- if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) {
- printf("ZFS WARNING: Recommended minimum RAM size is 512MB; "
- "expect unstable behavior.\n");
- }
- if (allmem < 512 * (1 << 20)) {
- printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; "
- "expect unstable behavior.\n");
- printf(" Consider tuning vm.kmem_size and "
- "vm.kmem_size_max\n");
- printf(" in /boot/loader.conf.\n");
- }
-#endif
-}
-
-void
-arc_fini(void)
-{
- arc_prune_t *p;
-
-#ifdef _KERNEL
- if (arc_event_lowmem != NULL)
- EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem);
-#endif
-
- /* Use B_TRUE to ensure *all* buffers are evicted */
- arc_flush(NULL, B_TRUE);
-
- mutex_enter(&arc_dnlc_evicts_lock);
- arc_dnlc_evicts_thread_exit = TRUE;
- /*
- * The dnlc evicts thread will set arc_dnlc_evicts_thread_exit
- * to FALSE when it is finished exiting; we're waiting for that.
- */
- while (arc_dnlc_evicts_thread_exit) {
- cv_signal(&arc_dnlc_evicts_cv);
- cv_wait(&arc_dnlc_evicts_cv, &arc_dnlc_evicts_lock);
- }
- mutex_exit(&arc_dnlc_evicts_lock);
-
- arc_initialized = B_FALSE;
-
- if (arc_ksp != NULL) {
- kstat_delete(arc_ksp);
- arc_ksp = NULL;
- }
-
- taskq_wait(arc_prune_taskq);
- taskq_destroy(arc_prune_taskq);
-
- mutex_enter(&arc_prune_mtx);
- while ((p = list_head(&arc_prune_list)) != NULL) {
- list_remove(&arc_prune_list, p);
- zfs_refcount_remove(&p->p_refcnt, &arc_prune_list);
- zfs_refcount_destroy(&p->p_refcnt);
- kmem_free(p, sizeof (*p));
- }
- mutex_exit(&arc_prune_mtx);
-
- list_destroy(&arc_prune_list);
- mutex_destroy(&arc_prune_mtx);
-
- (void) zthr_cancel(arc_adjust_zthr);
- zthr_destroy(arc_adjust_zthr);
-
- mutex_destroy(&arc_dnlc_evicts_lock);
- cv_destroy(&arc_dnlc_evicts_cv);
-
- (void) zthr_cancel(arc_reap_zthr);
- zthr_destroy(arc_reap_zthr);
-
- mutex_destroy(&arc_adjust_lock);
- cv_destroy(&arc_adjust_waiters_cv);
-
- /*
- * buf_fini() must precede arc_state_fini() because buf_fini() may
- * trigger the release of kmem magazines, which can call back to
- * arc_space_return(), which accesses aggsums freed in arc_state_fini().
- */
- buf_fini();
- arc_state_fini();
-
- ASSERT0(arc_loaned_bytes);
-}
-
-/*
- * Level 2 ARC
- *
- * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
- * It uses dedicated storage devices to hold cached data, which are populated
- * using large infrequent writes. The main role of this cache is to boost
- * the performance of random read workloads. The intended L2ARC devices
- * include short-stroked disks, solid state disks, and other media with
- * substantially faster read latency than disk.
- *
- * +-----------------------+
- * | ARC |
- * +-----------------------+
- * | ^ ^
- * | | |
- * l2arc_feed_thread() arc_read()
- * | | |
- * | l2arc read |
- * V | |
- * +---------------+ |
- * | L2ARC | |
- * +---------------+ |
- * | ^ |
- * l2arc_write() | |
- * | | |
- * V | |
- * +-------+ +-------+
- * | vdev | | vdev |
- * | cache | | cache |
- * +-------+ +-------+
- * +=========+ .-----.
- * : L2ARC : |-_____-|
- * : devices : | Disks |
- * +=========+ `-_____-'
- *
- * Read requests are satisfied from the following sources, in order:
- *
- * 1) ARC
- * 2) vdev cache of L2ARC devices
- * 3) L2ARC devices
- * 4) vdev cache of disks
- * 5) disks
- *
- * Some L2ARC device types exhibit extremely slow write performance.
- * To accommodate this, there are some significant differences between
- * the L2ARC and traditional cache design:
- *
- * 1. There is no eviction path from the ARC to the L2ARC. Evictions from
- * the ARC behave as usual, freeing buffers and placing headers on ghost
- * lists. The ARC does not send buffers to the L2ARC during eviction as
- * this would add inflated write latencies for all ARC memory pressure.
- *
- * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
- * It does this by periodically scanning buffers from the eviction-end of
- * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
- * not already there. It scans until a headroom of buffers is satisfied,
- * which itself is a buffer for ARC eviction. If a compressible buffer is
- * found during scanning and selected for writing to an L2ARC device, we
- * temporarily boost scanning headroom during the next scan cycle to make
- * sure we adapt to compression effects (which might significantly reduce
- * the data volume we write to L2ARC). The thread that does this is
- * l2arc_feed_thread(), illustrated below; example sizes are included to
- * provide a better sense of ratio than this diagram:
- *
- * head --> tail
- * +---------------------+----------+
- * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC
- * +---------------------+----------+ | o L2ARC eligible
- * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer
- * +---------------------+----------+ |
- * 15.9 Gbytes ^ 32 Mbytes |
- * headroom |
- * l2arc_feed_thread()
- * |
- * l2arc write hand <--[oooo]--'
- * | 8 Mbyte
- * | write max
- * V
- * +==============================+
- * L2ARC dev |####|#|###|###| |####| ... |
- * +==============================+
- * 32 Gbytes
- *
- * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
- * evicted, then the L2ARC has cached a buffer much sooner than it probably
- * needed to, potentially wasting L2ARC device bandwidth and storage. It is
- * safe to say that this is an uncommon case, since buffers at the end of
- * the ARC lists have moved there due to inactivity.
- *
- * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
- * then the L2ARC simply misses copying some buffers. This serves as a
- * pressure valve to prevent heavy read workloads from both stalling the ARC
- * with waits and clogging the L2ARC with writes. This also helps prevent
- * the potential for the L2ARC to churn if it attempts to cache content too
- * quickly, such as during backups of the entire pool.
- *
- * 5. After system boot and before the ARC has filled main memory, there are
- * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
- * lists can remain mostly static. Instead of searching from the tail of these
- * lists as pictured, the l2arc_feed_thread() will search from the list heads
- * for eligible buffers, greatly increasing its chance of finding them.
- *
- * The L2ARC device write speed is also boosted during this time so that
- * the L2ARC warms up faster. Since there have been no ARC evictions yet,
- * there are no L2ARC reads, and no fear of degrading read performance
- * through increased writes.
- *
- * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
- * the vdev queue can aggregate them into larger and fewer writes. Each
- * device is written to in a rotor fashion, sweeping writes through
- * available space then repeating.
- *
- * 7. The L2ARC does not store dirty content. It never needs to flush
- * write buffers back to disk based storage.
- *
- * 8. If an ARC buffer is written (and dirtied) which also exists in the
- * L2ARC, the now stale L2ARC buffer is immediately dropped.
- *
- * The performance of the L2ARC can be tweaked by a number of tunables, which
- * may be necessary for different workloads:
- *
- * l2arc_write_max max write bytes per interval
- * l2arc_write_boost extra write bytes during device warmup
- * l2arc_noprefetch skip caching prefetched buffers
- * l2arc_headroom number of max device writes to precache
- * l2arc_headroom_boost when we find compressed buffers during ARC
- * scanning, we multiply headroom by this
- * percentage factor for the next scan cycle,
- * since more compressed buffers are likely to
- * be present
- * l2arc_feed_secs seconds between L2ARC writing
- *
- * Tunables may be removed or added as future performance improvements are
- * integrated, and also may become zpool properties.
- *
- * There are three key functions that control how the L2ARC warms up:
- *
- * l2arc_write_eligible() check if a buffer is eligible to cache
- * l2arc_write_size() calculate how much to write
- * l2arc_write_interval() calculate sleep delay between writes
- *
- * These three functions determine what to write, how much, and how quickly
- * to send writes.
- */
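
As a hedged summary of the design above, the sketch below (simplified; the real locking, CPR handling, device selection, and error paths are omitted) shows how the three functions cooperate inside one feed cycle. The authoritative loop is l2arc_feed_thread() further down in this file.

/*
 * Illustrative outline of one feed cycle, under the signatures of the
 * static functions defined below in this file.
 */
static void
l2arc_feed_cycle_sketch(spa_t *spa, l2arc_dev_t *dev, clock_t begin,
    clock_t *next)
{
	uint64_t size, wrote;

	size = l2arc_write_size();			/* how much to write */
	l2arc_evict(dev, size, B_FALSE);		/* clear space ahead */
	wrote = l2arc_write_buffers(spa, dev, size);	/* what to write */
	*next = l2arc_write_interval(begin, size, wrote); /* how quickly */
}
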
-
-static boolean_t
-l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr)
-{
- /*
- * A buffer is *not* eligible for the L2ARC if it:
- * 1. belongs to a different spa.
- * 2. is already cached on the L2ARC.
- * 3. has an I/O in progress (it may be an incomplete read).
- * 4. is flagged not eligible (zfs property).
- */
- if (hdr->b_spa != spa_guid) {
- ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch);
- return (B_FALSE);
- }
- if (HDR_HAS_L2HDR(hdr)) {
- ARCSTAT_BUMP(arcstat_l2_write_in_l2);
- return (B_FALSE);
- }
- if (HDR_IO_IN_PROGRESS(hdr)) {
- ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress);
- return (B_FALSE);
- }
- if (!HDR_L2CACHE(hdr)) {
- ARCSTAT_BUMP(arcstat_l2_write_not_cacheable);
- return (B_FALSE);
- }
-
- return (B_TRUE);
-}
-
-static uint64_t
-l2arc_write_size(void)
-{
- uint64_t size;
-
- /*
- * Make sure our globals have meaningful values in case the user
- * altered them.
- */
- size = l2arc_write_max;
- if (size == 0) {
- cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must "
- "be greater than zero, resetting it to the default (%d)",
- L2ARC_WRITE_SIZE);
- size = l2arc_write_max = L2ARC_WRITE_SIZE;
- }
-
- if (arc_warm == B_FALSE)
- size += l2arc_write_boost;
-
- return (size);
-}
-
-static clock_t
-l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
-{
- clock_t interval, next, now;
-
- /*
- * If the ARC lists are busy, increase our write rate; if the
- * lists are stale, idle back. This is achieved by checking
- * how much we previously wrote - if it was more than half of
- * what we wanted, schedule the next write much sooner.
- */
- if (l2arc_feed_again && wrote > (wanted / 2))
- interval = (hz * l2arc_feed_min_ms) / 1000;
- else
- interval = hz * l2arc_feed_secs;
-
- now = ddi_get_lbolt();
- next = MAX(now, MIN(now + interval, began + interval));
-
- return (next);
-}
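
A worked example of the interval math above, assuming hz = 1000, l2arc_feed_secs = 1, and l2arc_feed_min_ms = 200 (plausible values, not taken from this file):

#include <stdio.h>

int
main(void)
{
	const long hz = 1000, feed_secs = 1, feed_min_ms = 200;
	long wanted = 8 << 20;		/* bytes we wanted to write */
	long wrote = 5 << 20;		/* bytes actually written */
	long interval;

	/* wrote > wanted / 2, so schedule the next write much sooner. */
	if (wrote > wanted / 2)
		interval = (hz * feed_min_ms) / 1000;	/* 200 ticks */
	else
		interval = hz * feed_secs;		/* 1000 ticks */

	printf("next write in %ld ticks\n", interval);
	return (0);
}
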
-
-/*
- * Cycle through L2ARC devices. This is how L2ARC load balances.
- * If a device is returned, this also returns holding the spa config lock.
- */
-static l2arc_dev_t *
-l2arc_dev_get_next(void)
-{
- l2arc_dev_t *first, *next = NULL;
-
- /*
- * Lock out the removal of spas (spa_namespace_lock), then removal
- * of cache devices (l2arc_dev_mtx). Once a device has been selected,
- * both locks will be dropped and a spa config lock held instead.
- */
- mutex_enter(&spa_namespace_lock);
- mutex_enter(&l2arc_dev_mtx);
-
- /* if there are no vdevs, there is nothing to do */
- if (l2arc_ndev == 0)
- goto out;
-
- first = NULL;
- next = l2arc_dev_last;
- do {
- /* loop around the list looking for a non-faulted vdev */
- if (next == NULL) {
- next = list_head(l2arc_dev_list);
- } else {
- next = list_next(l2arc_dev_list, next);
- if (next == NULL)
- next = list_head(l2arc_dev_list);
- }
-
- /* if we have come back to the start, bail out */
- if (first == NULL)
- first = next;
- else if (next == first)
- break;
-
- } while (vdev_is_dead(next->l2ad_vdev));
-
- /* if we were unable to find any usable vdevs, return NULL */
- if (vdev_is_dead(next->l2ad_vdev))
- next = NULL;
-
- l2arc_dev_last = next;
-
-out:
- mutex_exit(&l2arc_dev_mtx);
-
- /*
- * Grab the config lock to prevent the 'next' device from being
- * removed while we are writing to it.
- */
- if (next != NULL)
- spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
- mutex_exit(&spa_namespace_lock);
-
- return (next);
-}
-
-/*
- * Free buffers that were tagged for destruction.
- */
-static void
-l2arc_do_free_on_write(void)
-{
- list_t *buflist;
- l2arc_data_free_t *df, *df_prev;
-
- mutex_enter(&l2arc_free_on_write_mtx);
- buflist = l2arc_free_on_write;
-
- for (df = list_tail(buflist); df; df = df_prev) {
- df_prev = list_prev(buflist, df);
- ASSERT3P(df->l2df_abd, !=, NULL);
- abd_free(df->l2df_abd);
- list_remove(buflist, df);
- kmem_free(df, sizeof (l2arc_data_free_t));
- }
-
- mutex_exit(&l2arc_free_on_write_mtx);
-}
-
-/*
- * A write to a cache device has completed. Update all headers to allow
- * reads from these buffers to begin.
- */
-static void
-l2arc_write_done(zio_t *zio)
-{
- l2arc_write_callback_t *cb;
- l2arc_dev_t *dev;
- list_t *buflist;
- arc_buf_hdr_t *head, *hdr, *hdr_prev;
- kmutex_t *hash_lock;
- int64_t bytes_dropped = 0;
-
- cb = zio->io_private;
- ASSERT3P(cb, !=, NULL);
- dev = cb->l2wcb_dev;
- ASSERT3P(dev, !=, NULL);
- head = cb->l2wcb_head;
- ASSERT3P(head, !=, NULL);
- buflist = &dev->l2ad_buflist;
- ASSERT3P(buflist, !=, NULL);
- DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
- l2arc_write_callback_t *, cb);
-
- if (zio->io_error != 0)
- ARCSTAT_BUMP(arcstat_l2_writes_error);
-
- /*
- * All writes completed, or an error was hit.
- */
-top:
- mutex_enter(&dev->l2ad_mtx);
- for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) {
- hdr_prev = list_prev(buflist, hdr);
-
- hash_lock = HDR_LOCK(hdr);
-
- /*
- * We cannot use mutex_enter or else we can deadlock
- * with l2arc_write_buffers (due to swapping the order
- * the hash lock and l2ad_mtx are taken).
- */
- if (!mutex_tryenter(hash_lock)) {
- /*
- * Missed the hash lock. We must retry so we
- * don't leave the ARC_FLAG_L2_WRITING bit set.
- */
- ARCSTAT_BUMP(arcstat_l2_writes_lock_retry);
-
- /*
- * We don't want to rescan the headers we've
- * already marked as having been written out, so
- * we reinsert the head node so we can pick up
- * where we left off.
- */
- list_remove(buflist, head);
- list_insert_after(buflist, hdr, head);
-
- mutex_exit(&dev->l2ad_mtx);
-
- /*
- * We wait for the hash lock to become available
- * to try and prevent busy waiting, and increase
- * the chance we'll be able to acquire the lock
- * the next time around.
- */
- mutex_enter(hash_lock);
- mutex_exit(hash_lock);
- goto top;
- }
-
- /*
- * We could not have been moved into the arc_l2c_only
- * state while in-flight due to our ARC_FLAG_L2_WRITING
- * bit being set. Let's just ensure that's being enforced.
- */
- ASSERT(HDR_HAS_L1HDR(hdr));
-
- if (zio->io_error != 0) {
- /*
- * Error - drop L2ARC entry.
- */
- list_remove(buflist, hdr);
- l2arc_trim(hdr);
- arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR);
-
- ARCSTAT_INCR(arcstat_l2_psize, -arc_hdr_size(hdr));
- ARCSTAT_INCR(arcstat_l2_lsize, -HDR_GET_LSIZE(hdr));
-
- bytes_dropped += arc_hdr_size(hdr);
- (void) zfs_refcount_remove_many(&dev->l2ad_alloc,
- arc_hdr_size(hdr), hdr);
- }
-
- /*
- * Allow ARC to begin reads and ghost list evictions to
- * this L2ARC entry.
- */
- arc_hdr_clear_flags(hdr, ARC_FLAG_L2_WRITING);
-
- mutex_exit(hash_lock);
- }
-
- atomic_inc_64(&l2arc_writes_done);
- list_remove(buflist, head);
- ASSERT(!HDR_HAS_L1HDR(head));
- kmem_cache_free(hdr_l2only_cache, head);
- mutex_exit(&dev->l2ad_mtx);
-
- vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0);
-
- l2arc_do_free_on_write();
-
- kmem_free(cb, sizeof (l2arc_write_callback_t));
-}
-
-/*
- * A read to a cache device completed. Validate buffer contents before
- * handing over to the regular ARC routines.
- */
-static void
-l2arc_read_done(zio_t *zio)
-{
- l2arc_read_callback_t *cb;
- arc_buf_hdr_t *hdr;
- kmutex_t *hash_lock;
- boolean_t valid_cksum;
-
- ASSERT3P(zio->io_vd, !=, NULL);
- ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
-
- spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
-
- cb = zio->io_private;
- ASSERT3P(cb, !=, NULL);
- hdr = cb->l2rcb_hdr;
- ASSERT3P(hdr, !=, NULL);
-
- hash_lock = HDR_LOCK(hdr);
- mutex_enter(hash_lock);
- ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
-
- /*
- * If the data was read into a temporary buffer,
- * move it and free the buffer.
- */
- if (cb->l2rcb_abd != NULL) {
- ASSERT3U(arc_hdr_size(hdr), <, zio->io_size);
- if (zio->io_error == 0) {
- abd_copy(hdr->b_l1hdr.b_pabd, cb->l2rcb_abd,
- arc_hdr_size(hdr));
- }
-
- /*
- * The following must be done regardless of whether
- * there was an error:
- * - free the temporary buffer
- * - point zio to the real ARC buffer
- * - set zio size accordingly
- * These are required because zio is either re-used for
- * an I/O of the block in the case of an error
- * or the zio is passed to arc_read_done() and it
- * needs real data.
- */
- abd_free(cb->l2rcb_abd);
- zio->io_size = zio->io_orig_size = arc_hdr_size(hdr);
- zio->io_abd = zio->io_orig_abd = hdr->b_l1hdr.b_pabd;
- }
-
- ASSERT3P(zio->io_abd, !=, NULL);
-
- /*
- * Check this survived the L2ARC journey.
- */
- ASSERT3P(zio->io_abd, ==, hdr->b_l1hdr.b_pabd);
- zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */
- zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */
-
- valid_cksum = arc_cksum_is_equal(hdr, zio);
- if (valid_cksum && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
- mutex_exit(hash_lock);
- zio->io_private = hdr;
- arc_read_done(zio);
- } else {
- /*
- * Buffer didn't survive caching. Increment stats and
- * reissue to the original storage device.
- */
- if (zio->io_error != 0) {
- ARCSTAT_BUMP(arcstat_l2_io_error);
- } else {
- zio->io_error = SET_ERROR(EIO);
- }
- if (!valid_cksum)
- ARCSTAT_BUMP(arcstat_l2_cksum_bad);
-
- /*
- * If there's no waiter, issue an async i/o to the primary
- * storage now. If there *is* a waiter, the caller must
- * issue the i/o in a context where it's OK to block.
- */
- if (zio->io_waiter == NULL) {
- zio_t *pio = zio_unique_parent(zio);
-
- ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
-
- zio = zio_read(pio, zio->io_spa, zio->io_bp,
- hdr->b_l1hdr.b_pabd, zio->io_size, arc_read_done,
- hdr, zio->io_priority, cb->l2rcb_flags,
- &cb->l2rcb_zb);
- for (struct arc_callback *acb = hdr->b_l1hdr.b_acb;
- acb != NULL; acb = acb->acb_next)
- acb->acb_zio_head = zio;
- mutex_exit(hash_lock);
- zio_nowait(zio);
- } else
- mutex_exit(hash_lock);
- }
-
- kmem_free(cb, sizeof (l2arc_read_callback_t));
-}
-
-/*
- * This is the list priority from which the L2ARC will search for pages to
- * cache. This is used within loops (0..3) to cycle through lists in the
- * desired order. This order can have a significant effect on cache
- * performance.
- *
- * Currently the metadata lists are hit first, MFU then MRU, followed by
- * the data lists. This function returns a locked list, and also returns
- * the lock pointer.
- */
-static multilist_sublist_t *
-l2arc_sublist_lock(int list_num)
-{
- multilist_t *ml = NULL;
- unsigned int idx;
-
- ASSERT(list_num >= 0 && list_num <= 3);
-
- switch (list_num) {
- case 0:
- ml = arc_mfu->arcs_list[ARC_BUFC_METADATA];
- break;
- case 1:
- ml = arc_mru->arcs_list[ARC_BUFC_METADATA];
- break;
- case 2:
- ml = arc_mfu->arcs_list[ARC_BUFC_DATA];
- break;
- case 3:
- ml = arc_mru->arcs_list[ARC_BUFC_DATA];
- break;
- }
-
- /*
- * Return a randomly-selected sublist. This is acceptable
- * because the caller feeds only a little bit of data for each
- * call (8MB). Subsequent calls will result in different
- * sublists being selected.
- */
- idx = multilist_get_random_index(ml);
- return (multilist_sublist_lock(ml, idx));
-}
-
-/*
- * Evict buffers from the device write hand to the distance specified in
- * bytes. This distance may span populated buffers or it may span nothing.
- * This is clearing a region on the L2ARC device ready for writing.
- * If the 'all' boolean is set, every buffer is evicted.
- */
-static void
-l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
-{
- list_t *buflist;
- arc_buf_hdr_t *hdr, *hdr_prev;
- kmutex_t *hash_lock;
- uint64_t taddr;
-
- buflist = &dev->l2ad_buflist;
-
- if (!all && dev->l2ad_first) {
- /*
- * This is the first sweep through the device. There is
- * nothing to evict.
- */
- return;
- }
-
- if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
- /*
- * When nearing the end of the device, evict to the end
- * before the device write hand jumps to the start.
- */
- taddr = dev->l2ad_end;
- } else {
- taddr = dev->l2ad_hand + distance;
- }
- DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
- uint64_t, taddr, boolean_t, all);
-
-top:
- mutex_enter(&dev->l2ad_mtx);
- for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) {
- hdr_prev = list_prev(buflist, hdr);
-
- hash_lock = HDR_LOCK(hdr);
-
- /*
- * We cannot use mutex_enter or else we can deadlock
- * with l2arc_write_buffers (due to swapping the order
- * the hash lock and l2ad_mtx are taken).
- */
- if (!mutex_tryenter(hash_lock)) {
- /*
- * Missed the hash lock. Retry.
- */
- ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
- mutex_exit(&dev->l2ad_mtx);
- mutex_enter(hash_lock);
- mutex_exit(hash_lock);
- goto top;
- }
-
- /*
- * A header can't be on this list if it doesn't have an L2 header.
- */
- ASSERT(HDR_HAS_L2HDR(hdr));
-
- /* Ensure this header has finished being written. */
- ASSERT(!HDR_L2_WRITING(hdr));
- ASSERT(!HDR_L2_WRITE_HEAD(hdr));
-
- if (!all && (hdr->b_l2hdr.b_daddr >= taddr ||
- hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) {
- /*
- * We've evicted to the target address,
- * or the end of the device.
- */
- mutex_exit(hash_lock);
- break;
- }
-
- if (!HDR_HAS_L1HDR(hdr)) {
- ASSERT(!HDR_L2_READING(hdr));
- /*
- * This doesn't exist in the ARC. Destroy.
- * arc_hdr_destroy() will call list_remove()
- * and decrement arcstat_l2_lsize.
- */
- arc_change_state(arc_anon, hdr, hash_lock);
- arc_hdr_destroy(hdr);
- } else {
- ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only);
- ARCSTAT_BUMP(arcstat_l2_evict_l1cached);
- /*
- * Invalidate issued or about to be issued
- * reads, since we may be about to write
- * over this location.
- */
- if (HDR_L2_READING(hdr)) {
- ARCSTAT_BUMP(arcstat_l2_evict_reading);
- arc_hdr_set_flags(hdr, ARC_FLAG_L2_EVICTED);
- }
-
- arc_hdr_l2hdr_destroy(hdr);
- }
- mutex_exit(hash_lock);
- }
- mutex_exit(&dev->l2ad_mtx);
-}
-
-/*
- * Find and write ARC buffers to the L2ARC device.
- *
- * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid
- * for reading until they have completed writing.
- *
- * Returns the number of bytes actually written (which may be smaller than
- * the delta by which the device hand has changed due to alignment).
- */
-static uint64_t
-l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
-{
- arc_buf_hdr_t *hdr, *hdr_prev, *head;
- uint64_t write_asize, write_psize, write_lsize, headroom;
- boolean_t full;
- l2arc_write_callback_t *cb;
- zio_t *pio, *wzio;
- uint64_t guid = spa_load_guid(spa);
- int try;
-
- ASSERT3P(dev->l2ad_vdev, !=, NULL);
-
- pio = NULL;
- write_lsize = write_asize = write_psize = 0;
- full = B_FALSE;
- head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE);
- arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR);
-
- ARCSTAT_BUMP(arcstat_l2_write_buffer_iter);
- /*
- * Copy buffers for L2ARC writing.
- */
- for (try = 0; try <= 3; try++) {
- multilist_sublist_t *mls = l2arc_sublist_lock(try);
- uint64_t passed_sz = 0;
-
- ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter);
-
- /*
- * L2ARC fast warmup.
- *
- * Until the ARC is warm and starts to evict, read from the
- * head of the ARC lists rather than the tail.
- */
- if (arc_warm == B_FALSE)
- hdr = multilist_sublist_head(mls);
- else
- hdr = multilist_sublist_tail(mls);
- if (hdr == NULL)
- ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter);
-
- headroom = target_sz * l2arc_headroom;
- if (zfs_compressed_arc_enabled)
- headroom = (headroom * l2arc_headroom_boost) / 100;
-
- for (; hdr; hdr = hdr_prev) {
- kmutex_t *hash_lock;
-
- if (arc_warm == B_FALSE)
- hdr_prev = multilist_sublist_next(mls, hdr);
- else
- hdr_prev = multilist_sublist_prev(mls, hdr);
- ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned,
- HDR_GET_LSIZE(hdr));
-
- hash_lock = HDR_LOCK(hdr);
- if (!mutex_tryenter(hash_lock)) {
- ARCSTAT_BUMP(arcstat_l2_write_trylock_fail);
- /*
- * Skip this buffer rather than waiting.
- */
- continue;
- }
-
- passed_sz += HDR_GET_LSIZE(hdr);
- if (passed_sz > headroom) {
- /*
- * Searched too far.
- */
- mutex_exit(hash_lock);
- ARCSTAT_BUMP(arcstat_l2_write_passed_headroom);
- break;
- }
-
- if (!l2arc_write_eligible(guid, hdr)) {
- mutex_exit(hash_lock);
- continue;
- }
-
- /*
- * We rely on the L1 portion of the header below, so
- * it's invalid for this header to have been evicted out
- * of the ghost cache, prior to being written out. The
- * ARC_FLAG_L2_WRITING bit ensures this won't happen.
- */
- ASSERT(HDR_HAS_L1HDR(hdr));
-
- ASSERT3U(HDR_GET_PSIZE(hdr), >, 0);
- ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
- ASSERT3U(arc_hdr_size(hdr), >, 0);
- uint64_t psize = arc_hdr_size(hdr);
- uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev,
- psize);
-
- if ((write_asize + asize) > target_sz) {
- full = B_TRUE;
- mutex_exit(hash_lock);
- ARCSTAT_BUMP(arcstat_l2_write_full);
- break;
- }
-
- if (pio == NULL) {
- /*
- * Insert a dummy header on the buflist so
- * l2arc_write_done() can find where the
- * write buffers begin without searching.
- */
- mutex_enter(&dev->l2ad_mtx);
- list_insert_head(&dev->l2ad_buflist, head);
- mutex_exit(&dev->l2ad_mtx);
-
- cb = kmem_alloc(
- sizeof (l2arc_write_callback_t), KM_SLEEP);
- cb->l2wcb_dev = dev;
- cb->l2wcb_head = head;
- pio = zio_root(spa, l2arc_write_done, cb,
- ZIO_FLAG_CANFAIL);
- ARCSTAT_BUMP(arcstat_l2_write_pios);
- }
-
- hdr->b_l2hdr.b_dev = dev;
- hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
- arc_hdr_set_flags(hdr,
- ARC_FLAG_L2_WRITING | ARC_FLAG_HAS_L2HDR);
-
- mutex_enter(&dev->l2ad_mtx);
- list_insert_head(&dev->l2ad_buflist, hdr);
- mutex_exit(&dev->l2ad_mtx);
-
- (void) zfs_refcount_add_many(&dev->l2ad_alloc, psize,
- hdr);
-
- /*
- * Normally the L2ARC can use the hdr's data, but if
- * we're sharing data between the hdr and one of its
- * bufs, L2ARC needs its own copy of the data so that
- * the ZIO below can't race with the buf consumer.
- * Another case where we need to create a copy of the
- * data is when the buffer size is not device-aligned
- * and we need to pad the block to make it such.
- * That also keeps the clock hand suitably aligned.
- *
- * To ensure that the copy will be available for the
- * lifetime of the ZIO and be cleaned up afterwards, we
- * add it to the l2arc_free_on_write queue.
- */
- abd_t *to_write;
- if (!HDR_SHARED_DATA(hdr) && psize == asize) {
- to_write = hdr->b_l1hdr.b_pabd;
- } else {
- to_write = abd_alloc_for_io(asize,
- HDR_ISTYPE_METADATA(hdr));
- abd_copy(to_write, hdr->b_l1hdr.b_pabd, psize);
- if (asize != psize) {
- abd_zero_off(to_write, psize,
- asize - psize);
- }
- l2arc_free_abd_on_write(to_write, asize,
- arc_buf_type(hdr));
- }
- wzio = zio_write_phys(pio, dev->l2ad_vdev,
- hdr->b_l2hdr.b_daddr, asize, to_write,
- ZIO_CHECKSUM_OFF, NULL, hdr,
- ZIO_PRIORITY_ASYNC_WRITE,
- ZIO_FLAG_CANFAIL, B_FALSE);
-
- write_lsize += HDR_GET_LSIZE(hdr);
- DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
- zio_t *, wzio);
-
- write_psize += psize;
- write_asize += asize;
- dev->l2ad_hand += asize;
-
- mutex_exit(hash_lock);
-
- (void) zio_nowait(wzio);
- }
-
- multilist_sublist_unlock(mls);
-
- if (full == B_TRUE)
- break;
- }
-
- /* No buffers selected for writing? */
- if (pio == NULL) {
- ASSERT0(write_lsize);
- ASSERT(!HDR_HAS_L1HDR(head));
- kmem_cache_free(hdr_l2only_cache, head);
- return (0);
- }
-
- ASSERT3U(write_psize, <=, target_sz);
- ARCSTAT_BUMP(arcstat_l2_writes_sent);
- ARCSTAT_INCR(arcstat_l2_write_bytes, write_psize);
- ARCSTAT_INCR(arcstat_l2_lsize, write_lsize);
- ARCSTAT_INCR(arcstat_l2_psize, write_psize);
- vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0);
-
- /*
- * Bump device hand to the device start if it is approaching the end.
- * l2arc_evict() will already have evicted ahead for this case.
- */
- if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
- dev->l2ad_hand = dev->l2ad_start;
- dev->l2ad_first = B_FALSE;
- }
-
- dev->l2ad_writing = B_TRUE;
- (void) zio_wait(pio);
- dev->l2ad_writing = B_FALSE;
-
- return (write_asize);
-}
-
-/*
- * This thread feeds the L2ARC at regular intervals. This is the beating
- * heart of the L2ARC.
- */
-/* ARGSUSED */
-static void
-l2arc_feed_thread(void *unused __unused)
-{
- callb_cpr_t cpr;
- l2arc_dev_t *dev;
- spa_t *spa;
- uint64_t size, wrote;
- clock_t begin, next = ddi_get_lbolt();
-
- CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
-
- mutex_enter(&l2arc_feed_thr_lock);
-
- while (l2arc_thread_exit == 0) {
- CALLB_CPR_SAFE_BEGIN(&cpr);
- (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
- next - ddi_get_lbolt());
- CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
- next = ddi_get_lbolt() + hz;
-
- /*
- * Quick check for L2ARC devices.
- */
- mutex_enter(&l2arc_dev_mtx);
- if (l2arc_ndev == 0) {
- mutex_exit(&l2arc_dev_mtx);
- continue;
- }
- mutex_exit(&l2arc_dev_mtx);
- begin = ddi_get_lbolt();
-
- /*
- * This selects the next l2arc device to write to, and in
- * doing so the next spa to feed from: dev->l2ad_spa. This
- * will return NULL if there are now no l2arc devices or if
- * they are all faulted.
- *
- * If a device is returned, its spa's config lock is also
- * held to prevent device removal. l2arc_dev_get_next()
- * will grab and release l2arc_dev_mtx.
- */
- if ((dev = l2arc_dev_get_next()) == NULL)
- continue;
-
- spa = dev->l2ad_spa;
- ASSERT3P(spa, !=, NULL);
-
- /*
- * If the pool is read-only then force the feed thread to
- * sleep a little longer.
- */
- if (!spa_writeable(spa)) {
- next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
- spa_config_exit(spa, SCL_L2ARC, dev);
- continue;
- }
-
- /*
- * Avoid contributing to memory pressure.
- */
- if (arc_reclaim_needed()) {
- ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
- spa_config_exit(spa, SCL_L2ARC, dev);
- continue;
- }
-
- ARCSTAT_BUMP(arcstat_l2_feeds);
-
- size = l2arc_write_size();
-
- /*
- * Evict L2ARC buffers that will be overwritten.
- */
- l2arc_evict(dev, size, B_FALSE);
-
- /*
- * Write ARC buffers.
- */
- wrote = l2arc_write_buffers(spa, dev, size);
-
- /*
- * Calculate interval between writes.
- */
- next = l2arc_write_interval(begin, size, wrote);
- spa_config_exit(spa, SCL_L2ARC, dev);
- }
-
- l2arc_thread_exit = 0;
- cv_broadcast(&l2arc_feed_thr_cv);
- CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */
- thread_exit();
-}
-
-boolean_t
-l2arc_vdev_present(vdev_t *vd)
-{
- l2arc_dev_t *dev;
-
- mutex_enter(&l2arc_dev_mtx);
- for (dev = list_head(l2arc_dev_list); dev != NULL;
- dev = list_next(l2arc_dev_list, dev)) {
- if (dev->l2ad_vdev == vd)
- break;
- }
- mutex_exit(&l2arc_dev_mtx);
-
- return (dev != NULL);
-}
-
-/*
- * Add a vdev for use by the L2ARC. By this point the spa has already
- * validated the vdev and opened it.
- */
-void
-l2arc_add_vdev(spa_t *spa, vdev_t *vd)
-{
- l2arc_dev_t *adddev;
-
- ASSERT(!l2arc_vdev_present(vd));
-
- vdev_ashift_optimize(vd);
-
- /*
- * Create a new l2arc device entry.
- */
- adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
- adddev->l2ad_spa = spa;
- adddev->l2ad_vdev = vd;
- adddev->l2ad_start = VDEV_LABEL_START_SIZE;
- adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
- adddev->l2ad_hand = adddev->l2ad_start;
- adddev->l2ad_first = B_TRUE;
- adddev->l2ad_writing = B_FALSE;
-
- mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL);
- /*
- * This is a list of all ARC buffers that are still valid on the
- * device.
- */
- list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
- offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node));
-
- vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
- zfs_refcount_create(&adddev->l2ad_alloc);
-
- /*
- * Add device to global list
- */
- mutex_enter(&l2arc_dev_mtx);
- list_insert_head(l2arc_dev_list, adddev);
- atomic_inc_64(&l2arc_ndev);
- mutex_exit(&l2arc_dev_mtx);
-}
-
-/*
- * Remove a vdev from the L2ARC.
- */
-void
-l2arc_remove_vdev(vdev_t *vd)
-{
- l2arc_dev_t *dev, *nextdev, *remdev = NULL;
-
- /*
- * Find the device by vdev
- */
- mutex_enter(&l2arc_dev_mtx);
- for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
- nextdev = list_next(l2arc_dev_list, dev);
- if (vd == dev->l2ad_vdev) {
- remdev = dev;
- break;
- }
- }
- ASSERT3P(remdev, !=, NULL);
-
- /*
- * Remove device from global list
- */
- list_remove(l2arc_dev_list, remdev);
- l2arc_dev_last = NULL; /* may have been invalidated */
- atomic_dec_64(&l2arc_ndev);
- mutex_exit(&l2arc_dev_mtx);
-
- /*
- * Clear all buflists and ARC references. L2ARC device flush.
- */
- l2arc_evict(remdev, 0, B_TRUE);
- list_destroy(&remdev->l2ad_buflist);
- mutex_destroy(&remdev->l2ad_mtx);
- zfs_refcount_destroy(&remdev->l2ad_alloc);
- kmem_free(remdev, sizeof (l2arc_dev_t));
-}
-
-void
-l2arc_init(void)
-{
- l2arc_thread_exit = 0;
- l2arc_ndev = 0;
- l2arc_writes_sent = 0;
- l2arc_writes_done = 0;
-
- mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
- cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
- mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
-
- l2arc_dev_list = &L2ARC_dev_list;
- l2arc_free_on_write = &L2ARC_free_on_write;
- list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
- offsetof(l2arc_dev_t, l2ad_node));
- list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
- offsetof(l2arc_data_free_t, l2df_list_node));
-}
-
-void
-l2arc_fini(void)
-{
- /*
- * This is called from dmu_fini(), which is called from spa_fini().
- * Because of this, we can assume that all l2arc devices have
- * already been removed when the pools themselves were removed.
- */
-
- l2arc_do_free_on_write();
-
- mutex_destroy(&l2arc_feed_thr_lock);
- cv_destroy(&l2arc_feed_thr_cv);
- mutex_destroy(&l2arc_dev_mtx);
- mutex_destroy(&l2arc_free_on_write_mtx);
-
- list_destroy(l2arc_dev_list);
- list_destroy(l2arc_free_on_write);
-}
-
-void
-l2arc_start(void)
-{
- if (!(spa_mode_global & FWRITE))
- return;
-
- (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
- TS_RUN, minclsyspri);
-}
-
-void
-l2arc_stop(void)
-{
- if (!(spa_mode_global & FWRITE))
- return;
-
- mutex_enter(&l2arc_feed_thr_lock);
- cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */
- l2arc_thread_exit = 1;
- while (l2arc_thread_exit != 0)
- cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
- mutex_exit(&l2arc_feed_thr_lock);
-}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/blkptr.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/blkptr.c
deleted file mode 100644
index d7a7fdb0e1b1..000000000000
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/blkptr.c
+++ /dev/null
@@ -1,152 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * This file and its contents are supplied under the terms of the
- * Common Development and Distribution License ("CDDL"), version 1.0.
- * You may only use this file in accordance with the terms of version
- * 1.0 of the CDDL.
- *
- * A full copy of the text of the CDDL should have accompanied this
- * source. A copy of the CDDL is also available via the Internet at
- * http://www.illumos.org/license/CDDL.
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
- */
-
-#include <sys/zfs_context.h>
-#include <sys/zio.h>
-#include <sys/zio_compress.h>
-
-/*
- * Embedded-data Block Pointers
- *
- * Normally, block pointers point (via their DVAs) to a block which holds data.
- * If the data that we need to store is very small, this is an inefficient
- * use of space, because a block must be at minimum 1 sector (typically 512
- * bytes or 4KB). Additionally, reading these small blocks tends to generate
- * more random reads.
- *
- * Embedded-data Block Pointers allow small pieces of data (the "payload",
- * up to 112 bytes) to be stored in the block pointer itself, instead of
- * being pointed to. The "Pointer" part of this name is a bit of a
- * misnomer, as nothing is pointed to.
- *
- * BP_EMBEDDED_TYPE_DATA block pointers allow highly-compressible data to
- * be embedded in the block pointer. The logic for this is handled in
- * the SPA, by the zio pipeline. Therefore most code outside the zio
- * pipeline doesn't need special-cases to handle these block pointers.
- *
- * See spa.h for details on the exact layout of embedded block pointers.
- */
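
A self-contained user-space sketch (not the kernel routines below, and it ignores the non-payload words that the real encoder skips via BPE_IS_PAYLOADWORD) of the little-endian packing scheme encode_embedded_bp_compressed() uses: payload byte i lands in bits (i % 8) * 8 of 64-bit word i / 8.

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint8_t payload[] = { 0x11, 0x22, 0x33, 0x44, 0x55 };
	uint64_t words[2] = { 0 };

	/* First byte goes into the low bits of the first word. */
	for (size_t i = 0; i < sizeof (payload); i++)
		words[i / 8] |= (uint64_t)payload[i] << ((i % 8) * 8);

	/* Prints 0x0000005544332211. */
	printf("word 0 = 0x%016llx\n", (unsigned long long)words[0]);
	return (0);
}
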
-
-void
-encode_embedded_bp_compressed(blkptr_t *bp, void *data,
- enum zio_compress comp, int uncompressed_size, int compressed_size)
-{
- uint64_t *bp64 = (uint64_t *)bp;
- uint64_t w = 0;
- uint8_t *data8 = data;
-
- ASSERT3U(compressed_size, <=, BPE_PAYLOAD_SIZE);
- ASSERT(uncompressed_size == compressed_size ||
- comp != ZIO_COMPRESS_OFF);
- ASSERT3U(comp, >=, ZIO_COMPRESS_OFF);
- ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS);
-
- bzero(bp, sizeof (*bp));
- BP_SET_EMBEDDED(bp, B_TRUE);
- BP_SET_COMPRESS(bp, comp);
- BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
- BPE_SET_LSIZE(bp, uncompressed_size);
- BPE_SET_PSIZE(bp, compressed_size);
-
- /*
- * Encode the byte array into the words of the block pointer.
- * First byte goes into low bits of first word (little endian).
- */
- for (int i = 0; i < compressed_size; i++) {
- BF64_SET(w, (i % sizeof (w)) * NBBY, NBBY, data8[i]);
- if (i % sizeof (w) == sizeof (w) - 1) {
- /* we've reached the end of a word */
- ASSERT3P(bp64, <, bp + 1);
- *bp64 = w;
- bp64++;
- if (!BPE_IS_PAYLOADWORD(bp, bp64))
- bp64++;
- w = 0;
- }
- }
- /* write last partial word */
- if (bp64 < (uint64_t *)(bp + 1))
- *bp64 = w;
-}
-
-/*
- * buf must be at least BPE_GET_PSIZE(bp) bytes long (which will never be
- * more than BPE_PAYLOAD_SIZE bytes).
- */
-void
-decode_embedded_bp_compressed(const blkptr_t *bp, void *buf)
-{
- int psize;
- uint8_t *buf8 = buf;
- uint64_t w = 0;
- const uint64_t *bp64 = (const uint64_t *)bp;
-
- ASSERT(BP_IS_EMBEDDED(bp));
-
- psize = BPE_GET_PSIZE(bp);
-
- /*
- * Decode the words of the block pointer into the byte array.
- * Low bits of first word are the first byte (little endian).
- */
- for (int i = 0; i < psize; i++) {
- if (i % sizeof (w) == 0) {
- /* beginning of a word */
- ASSERT3P(bp64, <, bp + 1);
- w = *bp64;
- bp64++;
- if (!BPE_IS_PAYLOADWORD(bp, bp64))
- bp64++;
- }
- buf8[i] = BF64_GET(w, (i % sizeof (w)) * NBBY, NBBY);
- }
-}
-
-/*
- * Fill in the buffer with the (decompressed) payload of the embedded
- * blkptr_t. Takes into account compression and byteorder (the payload is
- * treated as a stream of bytes).
- * Return 0 on success, or ENOSPC if it won't fit in the buffer.
- */
-int
-decode_embedded_bp(const blkptr_t *bp, void *buf, int buflen)
-{
- int lsize, psize;
-
- ASSERT(BP_IS_EMBEDDED(bp));
-
- lsize = BPE_GET_LSIZE(bp);
- psize = BPE_GET_PSIZE(bp);
-
- if (lsize > buflen)
- return (ENOSPC);
- ASSERT3U(lsize, ==, buflen);
-
- if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) {
- uint8_t dstbuf[BPE_PAYLOAD_SIZE];
- decode_embedded_bp_compressed(bp, dstbuf);
- VERIFY0(zio_decompress_data_buf(BP_GET_COMPRESS(bp),
- dstbuf, buf, psize, buflen));
- } else {
- ASSERT3U(lsize, ==, psize);
- decode_embedded_bp_compressed(bp, buf);
- }
-
- return (0);
-}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c
deleted file mode 100644
index ee12db3a266d..000000000000
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
- */
-
-#include <sys/bplist.h>
-#include <sys/zfs_context.h>
-
-
-void
-bplist_create(bplist_t *bpl)
-{
- mutex_init(&bpl->bpl_lock, NULL, MUTEX_DEFAULT, NULL);
- list_create(&bpl->bpl_list, sizeof (bplist_entry_t),
- offsetof(bplist_entry_t, bpe_node));
-}
-
-void
-bplist_destroy(bplist_t *bpl)
-{
- list_destroy(&bpl->bpl_list);
- mutex_destroy(&bpl->bpl_lock);
-}
-
-void
-bplist_append(bplist_t *bpl, const blkptr_t *bp)
-{
- bplist_entry_t *bpe = kmem_alloc(sizeof (*bpe), KM_SLEEP);
-
- mutex_enter(&bpl->bpl_lock);
- bpe->bpe_blk = *bp;
- list_insert_tail(&bpl->bpl_list, bpe);
- mutex_exit(&bpl->bpl_lock);
-}
-
-/*
- * To aid debugging, we keep the most recently removed entry. This way if
- * we are in the callback, we can easily locate the entry.
- */
-static bplist_entry_t *bplist_iterate_last_removed;
-
-void
-bplist_iterate(bplist_t *bpl, bplist_itor_t *func, void *arg, dmu_tx_t *tx)
-{
- bplist_entry_t *bpe;
-
- mutex_enter(&bpl->bpl_lock);
- while ((bpe = list_head(&bpl->bpl_list)) != NULL) {
- bplist_iterate_last_removed = bpe;
- list_remove(&bpl->bpl_list, bpe);
- mutex_exit(&bpl->bpl_lock);
- func(arg, &bpe->bpe_blk, tx);
- kmem_free(bpe, sizeof (*bpe));
- mutex_enter(&bpl->bpl_lock);
- }
- mutex_exit(&bpl->bpl_lock);
-}
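
A user-space sketch (hypothetical names, and LIFO push instead of bplist's FIFO tail-append) of the drain pattern bplist_iterate() uses: pop one entry under the lock, drop the lock while the callback runs so the callback may block or append more entries, then re-take the lock.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct entry {
	struct entry *next;
	int val;
};

static struct entry *head;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static void
append(int val)
{
	struct entry *e = malloc(sizeof (*e));

	e->val = val;
	pthread_mutex_lock(&lock);
	e->next = head;
	head = e;
	pthread_mutex_unlock(&lock);
}

static void
drain(void (*func)(int))
{
	struct entry *e;

	pthread_mutex_lock(&lock);
	while ((e = head) != NULL) {
		head = e->next;
		pthread_mutex_unlock(&lock);
		func(e->val);		/* callback runs unlocked */
		free(e);
		pthread_mutex_lock(&lock);
	}
	pthread_mutex_unlock(&lock);
}

static void
print_val(int v)
{
	printf("%d\n", v);
}

int
main(void)
{
	append(1);
	append(2);
	drain(print_val);	/* prints 2 then 1 (LIFO sketch) */
	return (0);
}
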
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c
deleted file mode 100644
index bbdd765214fc..000000000000
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c
+++ /dev/null
@@ -1,606 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
- * Copyright (c) 2014 Integros [integros.com]
- * Copyright (c) 2017 Datto Inc.
- */
-
-#include <sys/bpobj.h>
-#include <sys/zfs_context.h>
-#include <sys/refcount.h>
-#include <sys/dsl_pool.h>
-#include <sys/zfeature.h>
-#include <sys/zap.h>
-
-/*
- * Return an empty bpobj, preferably the empty dummy one (dp_empty_bpobj).
- */
-uint64_t
-bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx)
-{
- spa_t *spa = dmu_objset_spa(os);
- dsl_pool_t *dp = dmu_objset_pool(os);
-
- if (spa_feature_is_enabled(spa, SPA_FEATURE_EMPTY_BPOBJ)) {
- if (!spa_feature_is_active(spa, SPA_FEATURE_EMPTY_BPOBJ)) {
- ASSERT0(dp->dp_empty_bpobj);
- dp->dp_empty_bpobj =
- bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx);
- VERIFY(zap_add(os,
- DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
- &dp->dp_empty_bpobj, tx) == 0);
- }
- spa_feature_incr(spa, SPA_FEATURE_EMPTY_BPOBJ, tx);
- ASSERT(dp->dp_empty_bpobj != 0);
- return (dp->dp_empty_bpobj);
- } else {
- return (bpobj_alloc(os, blocksize, tx));
- }
-}
-
-void
-bpobj_decr_empty(objset_t *os, dmu_tx_t *tx)
-{
- dsl_pool_t *dp = dmu_objset_pool(os);
-
- spa_feature_decr(dmu_objset_spa(os), SPA_FEATURE_EMPTY_BPOBJ, tx);
- if (!spa_feature_is_active(dmu_objset_spa(os),
- SPA_FEATURE_EMPTY_BPOBJ)) {
- VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_EMPTY_BPOBJ, tx));
- VERIFY3U(0, ==, dmu_object_free(os, dp->dp_empty_bpobj, tx));
- dp->dp_empty_bpobj = 0;
- }
-}
-
-uint64_t
-bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
-{
- int size;
-
- if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_BPOBJ_ACCOUNT)
- size = BPOBJ_SIZE_V0;
- else if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)
- size = BPOBJ_SIZE_V1;
- else
- size = sizeof (bpobj_phys_t);
-
- return (dmu_object_alloc(os, DMU_OT_BPOBJ, blocksize,
- DMU_OT_BPOBJ_HDR, size, tx));
-}
-
-void
-bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
-{
- int64_t i;
- bpobj_t bpo;
- dmu_object_info_t doi;
- int epb;
- dmu_buf_t *dbuf = NULL;
-
- ASSERT(obj != dmu_objset_pool(os)->dp_empty_bpobj);
- VERIFY3U(0, ==, bpobj_open(&bpo, os, obj));
-
- mutex_enter(&bpo.bpo_lock);
-
- if (!bpo.bpo_havesubobj || bpo.bpo_phys->bpo_subobjs == 0)
- goto out;
-
- VERIFY3U(0, ==, dmu_object_info(os, bpo.bpo_phys->bpo_subobjs, &doi));
- epb = doi.doi_data_block_size / sizeof (uint64_t);
-
- for (i = bpo.bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) {
- uint64_t *objarray;
- uint64_t offset, blkoff;
-
- offset = i * sizeof (uint64_t);
- blkoff = P2PHASE(i, epb);
-
- if (dbuf == NULL || dbuf->db_offset > offset) {
- if (dbuf)
- dmu_buf_rele(dbuf, FTAG);
- VERIFY3U(0, ==, dmu_buf_hold(os,
- bpo.bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0));
- }
-
- ASSERT3U(offset, >=, dbuf->db_offset);
- ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);
-
- objarray = dbuf->db_data;
- bpobj_free(os, objarray[blkoff], tx);
- }
- if (dbuf) {
- dmu_buf_rele(dbuf, FTAG);
- dbuf = NULL;
- }
- VERIFY3U(0, ==, dmu_object_free(os, bpo.bpo_phys->bpo_subobjs, tx));
-
-out:
- mutex_exit(&bpo.bpo_lock);
- bpobj_close(&bpo);
-
- VERIFY3U(0, ==, dmu_object_free(os, obj, tx));
-}
-
-int
-bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object)
-{
- dmu_object_info_t doi;
- int err;
-
- err = dmu_object_info(os, object, &doi);
- if (err)
- return (err);
-
- bzero(bpo, sizeof (*bpo));
- mutex_init(&bpo->bpo_lock, NULL, MUTEX_DEFAULT, NULL);
-
- ASSERT(bpo->bpo_dbuf == NULL);
- ASSERT(bpo->bpo_phys == NULL);
- ASSERT(object != 0);
- ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ);
- ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPOBJ_HDR);
-
- err = dmu_bonus_hold(os, object, bpo, &bpo->bpo_dbuf);
- if (err)
- return (err);
-
- bpo->bpo_os = os;
- bpo->bpo_object = object;
- bpo->bpo_epb = doi.doi_data_block_size >> SPA_BLKPTRSHIFT;
- bpo->bpo_havecomp = (doi.doi_bonus_size > BPOBJ_SIZE_V0);
- bpo->bpo_havesubobj = (doi.doi_bonus_size > BPOBJ_SIZE_V1);
- bpo->bpo_phys = bpo->bpo_dbuf->db_data;
- return (0);
-}
-
-boolean_t
-bpobj_is_open(const bpobj_t *bpo)
-{
- return (bpo->bpo_object != 0);
-}
-
-void
-bpobj_close(bpobj_t *bpo)
-{
- /* Lame workaround for closing a bpobj that was never opened. */
- if (bpo->bpo_object == 0)
- return;
-
- dmu_buf_rele(bpo->bpo_dbuf, bpo);
- if (bpo->bpo_cached_dbuf != NULL)
- dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
- bpo->bpo_dbuf = NULL;
- bpo->bpo_phys = NULL;
- bpo->bpo_cached_dbuf = NULL;
- bpo->bpo_object = 0;
-
- mutex_destroy(&bpo->bpo_lock);
-}
-
-boolean_t
-bpobj_is_empty(bpobj_t *bpo)
-{
- return (bpo->bpo_phys->bpo_num_blkptrs == 0 &&
- (!bpo->bpo_havesubobj || bpo->bpo_phys->bpo_num_subobjs == 0));
-}
-
-static int
-bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx,
- boolean_t free)
-{
- dmu_object_info_t doi;
- int epb;
- int64_t i;
- int err = 0;
- dmu_buf_t *dbuf = NULL;
-
- ASSERT(bpobj_is_open(bpo));
- mutex_enter(&bpo->bpo_lock);
-
- if (free)
- dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
-
- for (i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= 0; i--) {
- blkptr_t *bparray;
- blkptr_t *bp;
- uint64_t offset, blkoff;
-
- offset = i * sizeof (blkptr_t);
- blkoff = P2PHASE(i, bpo->bpo_epb);
-
- if (dbuf == NULL || dbuf->db_offset > offset) {
- if (dbuf)
- dmu_buf_rele(dbuf, FTAG);
- err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, offset,
- FTAG, &dbuf, 0);
- if (err)
- break;
- }
-
- ASSERT3U(offset, >=, dbuf->db_offset);
- ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);
-
- bparray = dbuf->db_data;
- bp = &bparray[blkoff];
- err = func(arg, bp, tx);
- if (err)
- break;
- if (free) {
- bpo->bpo_phys->bpo_bytes -=
- bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp);
- ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0);
- if (bpo->bpo_havecomp) {
- bpo->bpo_phys->bpo_comp -= BP_GET_PSIZE(bp);
- bpo->bpo_phys->bpo_uncomp -= BP_GET_UCSIZE(bp);
- }
- bpo->bpo_phys->bpo_num_blkptrs--;
- ASSERT3S(bpo->bpo_phys->bpo_num_blkptrs, >=, 0);
- }
- }
- if (dbuf) {
- dmu_buf_rele(dbuf, FTAG);
- dbuf = NULL;
- }
- if (free) {
- VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os, bpo->bpo_object,
- (i + 1) * sizeof (blkptr_t), -1ULL, tx));
- }
- if (err || !bpo->bpo_havesubobj || bpo->bpo_phys->bpo_subobjs == 0)
- goto out;
-
- ASSERT(bpo->bpo_havecomp);
- err = dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi);
- if (err) {
- mutex_exit(&bpo->bpo_lock);
- return (err);
- }
- ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ);
- epb = doi.doi_data_block_size / sizeof (uint64_t);
-
- for (i = bpo->bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) {
- uint64_t *objarray;
- uint64_t offset, blkoff;
- bpobj_t sublist;
- uint64_t used_before, comp_before, uncomp_before;
- uint64_t used_after, comp_after, uncomp_after;
-
- offset = i * sizeof (uint64_t);
- blkoff = P2PHASE(i, epb);
-
- if (dbuf == NULL || dbuf->db_offset > offset) {
- if (dbuf)
- dmu_buf_rele(dbuf, FTAG);
- err = dmu_buf_hold(bpo->bpo_os,
- bpo->bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0);
- if (err)
- break;
- }
-
- ASSERT3U(offset, >=, dbuf->db_offset);
- ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);
-
- objarray = dbuf->db_data;
- err = bpobj_open(&sublist, bpo->bpo_os, objarray[blkoff]);
- if (err)
- break;
- if (free) {
- err = bpobj_space(&sublist,
- &used_before, &comp_before, &uncomp_before);
- if (err != 0) {
- bpobj_close(&sublist);
- break;
- }
- }
- err = bpobj_iterate_impl(&sublist, func, arg, tx, free);
- if (free) {
- VERIFY3U(0, ==, bpobj_space(&sublist,
- &used_after, &comp_after, &uncomp_after));
- bpo->bpo_phys->bpo_bytes -= used_before - used_after;
- ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0);
- bpo->bpo_phys->bpo_comp -= comp_before - comp_after;
- bpo->bpo_phys->bpo_uncomp -=
- uncomp_before - uncomp_after;
- }
-
- bpobj_close(&sublist);
- if (err)
- break;
- if (free) {
- err = dmu_object_free(bpo->bpo_os,
- objarray[blkoff], tx);
- if (err)
- break;
- bpo->bpo_phys->bpo_num_subobjs--;
- ASSERT3S(bpo->bpo_phys->bpo_num_subobjs, >=, 0);
- }
- }
- if (dbuf) {
- dmu_buf_rele(dbuf, FTAG);
- dbuf = NULL;
- }
- if (free) {
- VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os,
- bpo->bpo_phys->bpo_subobjs,
- (i + 1) * sizeof (uint64_t), -1ULL, tx));
- }
-
-out:
- /* If there are no entries, there should be no bytes. */
- if (bpobj_is_empty(bpo)) {
- ASSERT0(bpo->bpo_phys->bpo_bytes);
- ASSERT0(bpo->bpo_phys->bpo_comp);
- ASSERT0(bpo->bpo_phys->bpo_uncomp);
- }
-
- mutex_exit(&bpo->bpo_lock);
- return (err);
-}
-
-/*
- * Iterate and remove the entries. If func returns nonzero, iteration
- * will stop and that entry will not be removed.
- */
-int
-bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
-{
- return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE));
-}
-
-/*
- * Iterate the entries. If func returns nonzero, iteration will stop.
- */
-int
-bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
-{
- return (bpobj_iterate_impl(bpo, func, arg, tx, B_FALSE));
-}
-
-void
-bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
-{
- bpobj_t subbpo;
- uint64_t used, comp, uncomp, subsubobjs;
-
- ASSERT(bpobj_is_open(bpo));
- ASSERT(subobj != 0);
- ASSERT(bpo->bpo_havesubobj);
- ASSERT(bpo->bpo_havecomp);
- ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);
-
- if (subobj == dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj) {
- bpobj_decr_empty(bpo->bpo_os, tx);
- return;
- }
-
- VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj));
- VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp));
-
- if (bpobj_is_empty(&subbpo)) {
- /* No point in having an empty subobj. */
- bpobj_close(&subbpo);
- bpobj_free(bpo->bpo_os, subobj, tx);
- return;
- }
-
- mutex_enter(&bpo->bpo_lock);
- dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
- if (bpo->bpo_phys->bpo_subobjs == 0) {
- bpo->bpo_phys->bpo_subobjs = dmu_object_alloc(bpo->bpo_os,
- DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE,
- DMU_OT_NONE, 0, tx);
- }
-
- dmu_object_info_t doi;
- ASSERT0(dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi));
- ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ);
-
- dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
- bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
- sizeof (subobj), &subobj, tx);
- bpo->bpo_phys->bpo_num_subobjs++;
-
- /*
- * If subobj has only one block of subobjs, then move subobj's
- * subobjs to bpo's subobj list directly. This reduces
- * recursion in bpobj_iterate due to nested subobjs.
- */
- subsubobjs = subbpo.bpo_phys->bpo_subobjs;
- if (subsubobjs != 0) {
- dmu_object_info_t doi;
-
- VERIFY3U(0, ==, dmu_object_info(bpo->bpo_os, subsubobjs, &doi));
- if (doi.doi_max_offset == doi.doi_data_block_size) {
- dmu_buf_t *subdb;
- uint64_t numsubsub = subbpo.bpo_phys->bpo_num_subobjs;
-
- VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, subsubobjs,
- 0, FTAG, &subdb, 0));
- /*
- * Make sure that we are not asking dmu_write()
- * to write more data than we have in our buffer.
- */
- VERIFY3U(subdb->db_size, >=,
- numsubsub * sizeof (subobj));
- dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
- bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
- numsubsub * sizeof (subobj), subdb->db_data, tx);
- dmu_buf_rele(subdb, FTAG);
- bpo->bpo_phys->bpo_num_subobjs += numsubsub;
-
- dmu_buf_will_dirty(subbpo.bpo_dbuf, tx);
- subbpo.bpo_phys->bpo_subobjs = 0;
- VERIFY3U(0, ==, dmu_object_free(bpo->bpo_os,
- subsubobjs, tx));
- }
- }
- bpo->bpo_phys->bpo_bytes += used;
- bpo->bpo_phys->bpo_comp += comp;
- bpo->bpo_phys->bpo_uncomp += uncomp;
- mutex_exit(&bpo->bpo_lock);
-
- bpobj_close(&subbpo);
-}
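/*
 * A standalone sketch (not part of this diff) of the flattening
 * optimization above, modeled with plain arrays: if the child's subobj
 * list fits in one "block", its entries are appended to the parent's list
 * and the child's list is released, so a later iteration recurses one
 * level less. The epb threshold stands in for the
 * doi_max_offset == doi_data_block_size test.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef struct {
	uint64_t *ids;		/* stand-in for the DMU_OT_BPOBJ_SUBOBJ array */
	uint64_t nids;
} toy_list_t;

static void
flatten_if_small(toy_list_t *parent, toy_list_t *child, uint64_t epb)
{
	/* One block only: mirrors doi_max_offset == doi_data_block_size. */
	if (child->nids == 0 || child->nids > epb)
		return;
	parent->ids = realloc(parent->ids,
	    (parent->nids + child->nids) * sizeof (uint64_t));
	memcpy(parent->ids + parent->nids, child->ids,
	    child->nids * sizeof (uint64_t));
	parent->nids += child->nids;
	free(child->ids);			/* dmu_object_free() analogue */
	child->ids = NULL;
	child->nids = 0;
}

int
main(void)
{
	uint64_t cids[] = { 7, 8, 9 };
	toy_list_t parent = { NULL, 0 };
	toy_list_t child = { malloc(sizeof (cids)), 3 };

	memcpy(child.ids, cids, sizeof (cids));
	flatten_if_small(&parent, &child, 512);
	for (uint64_t i = 0; i < parent.nids; i++)
		printf("parent[%ju] = %ju\n", (uintmax_t)i,
		    (uintmax_t)parent.ids[i]);
	free(parent.ids);
	return (0);
}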
-
-void
-bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx)
-{
- blkptr_t stored_bp = *bp;
- uint64_t offset;
- int blkoff;
- blkptr_t *bparray;
-
- ASSERT(bpobj_is_open(bpo));
- ASSERT(!BP_IS_HOLE(bp));
- ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);
-
- if (BP_IS_EMBEDDED(bp)) {
- /*
- * The bpobj will compress better without the payload.
- *
- * Note that we store EMBEDDED bp's because they have an
- * uncompressed size, which must be accounted for. An
- * alternative would be to add their size to bpo_uncomp
- * without storing the bp, but that would create additional
- * complications: bpo_uncomp would be inconsistent with the
- * set of BP's stored, and bpobj_iterate() wouldn't visit
- * all the space accounted for in the bpobj.
- */
- bzero(&stored_bp, sizeof (stored_bp));
- stored_bp.blk_prop = bp->blk_prop;
- stored_bp.blk_birth = bp->blk_birth;
- } else if (!BP_GET_DEDUP(bp)) {
- /* The bpobj will compress better without the checksum */
- bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum));
- }
-
- /* We never need the fill count. */
- stored_bp.blk_fill = 0;
-
- mutex_enter(&bpo->bpo_lock);
-
- offset = bpo->bpo_phys->bpo_num_blkptrs * sizeof (stored_bp);
- blkoff = P2PHASE(bpo->bpo_phys->bpo_num_blkptrs, bpo->bpo_epb);
-
- if (bpo->bpo_cached_dbuf == NULL ||
- offset < bpo->bpo_cached_dbuf->db_offset ||
- offset >= bpo->bpo_cached_dbuf->db_offset +
- bpo->bpo_cached_dbuf->db_size) {
- if (bpo->bpo_cached_dbuf)
- dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
- VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, bpo->bpo_object,
- offset, bpo, &bpo->bpo_cached_dbuf, 0));
- }
-
- dmu_buf_will_dirty(bpo->bpo_cached_dbuf, tx);
- bparray = bpo->bpo_cached_dbuf->db_data;
- bparray[blkoff] = stored_bp;
-
- dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
- bpo->bpo_phys->bpo_num_blkptrs++;
- bpo->bpo_phys->bpo_bytes +=
- bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp);
- if (bpo->bpo_havecomp) {
- bpo->bpo_phys->bpo_comp += BP_GET_PSIZE(bp);
- bpo->bpo_phys->bpo_uncomp += BP_GET_UCSIZE(bp);
- }
- mutex_exit(&bpo->bpo_lock);
-}
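/*
 * A standalone sketch (not part of this diff) of the cached-dbuf test
 * above, reduced to its arithmetic: a new buffer is "held" only when the
 * write offset falls outside the window [db_offset, db_offset + db_size)
 * of the buffer already cached, so append-only enqueues hold once per
 * block rather than once per entry. The 128-byte entry size stands in
 * for sizeof (blkptr_t).
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t blksz = 4096;		/* size of each held buffer */
	uint64_t db_offset = 0;		/* start of the cached window */
	int cached = 0;			/* do we hold a buffer yet? */
	int holds = 0;

	for (uint64_t n = 0; n < 1024; n++) {	/* bpo_num_blkptrs analogue */
		uint64_t offset = n * 128;	/* 128 bytes per entry */

		if (!cached || offset < db_offset ||
		    offset >= db_offset + blksz) {
			db_offset = offset - (offset % blksz);
			cached = 1;
			holds++;		/* dmu_buf_hold() analogue */
		}
	}
	printf("1024 appends -> %d buffer holds\n", holds);
	return (0);
}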
-
-struct space_range_arg {
- spa_t *spa;
- uint64_t mintxg;
- uint64_t maxtxg;
- uint64_t used;
- uint64_t comp;
- uint64_t uncomp;
-};
-
-/* ARGSUSED */
-static int
-space_range_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
-{
- struct space_range_arg *sra = arg;
-
- if (bp->blk_birth > sra->mintxg && bp->blk_birth <= sra->maxtxg) {
- if (dsl_pool_sync_context(spa_get_dsl(sra->spa)))
- sra->used += bp_get_dsize_sync(sra->spa, bp);
- else
- sra->used += bp_get_dsize(sra->spa, bp);
- sra->comp += BP_GET_PSIZE(bp);
- sra->uncomp += BP_GET_UCSIZE(bp);
- }
- return (0);
-}
-
-int
-bpobj_space(bpobj_t *bpo, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
-{
- ASSERT(bpobj_is_open(bpo));
- mutex_enter(&bpo->bpo_lock);
-
- *usedp = bpo->bpo_phys->bpo_bytes;
- if (bpo->bpo_havecomp) {
- *compp = bpo->bpo_phys->bpo_comp;
- *uncompp = bpo->bpo_phys->bpo_uncomp;
- mutex_exit(&bpo->bpo_lock);
- return (0);
- } else {
- mutex_exit(&bpo->bpo_lock);
- return (bpobj_space_range(bpo, 0, UINT64_MAX,
- usedp, compp, uncompp));
- }
-}
-
-/*
- * Return the amount of space in the bpobj which is:
- * mintxg < blk_birth <= maxtxg
- */
-int
-bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg,
- uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
-{
- struct space_range_arg sra = { 0 };
- int err;
-
- ASSERT(bpobj_is_open(bpo));
-
- /*
- * As an optimization, if they want the whole txg range, just
- * get bpo_bytes rather than iterating over the bps.
- */
- if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX && bpo->bpo_havecomp)
- return (bpobj_space(bpo, usedp, compp, uncompp));
-
- sra.spa = dmu_objset_spa(bpo->bpo_os);
- sra.mintxg = mintxg;
- sra.maxtxg = maxtxg;
-
- err = bpobj_iterate_nofree(bpo, space_range_cb, &sra, NULL);
- *usedp = sra.used;
- *compp = sra.comp;
- *uncompp = sra.uncomp;
- return (err);
-}
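/*
 * A standalone sketch (not part of this diff) of the
 * mintxg < blk_birth <= maxtxg filter applied by space_range_cb, over a
 * toy array of (birth txg, size) pairs instead of on-disk block pointers.
 */
#include <stdint.h>
#include <stdio.h>

typedef struct { uint64_t birth; uint64_t dsize; } toy_bp_t;

static uint64_t
space_in_range(const toy_bp_t *bps, int n, uint64_t mintxg, uint64_t maxtxg)
{
	uint64_t used = 0;

	for (int i = 0; i < n; i++) {
		if (bps[i].birth > mintxg && bps[i].birth <= maxtxg)
			used += bps[i].dsize;
	}
	return (used);
}

int
main(void)
{
	toy_bp_t bps[] = {
		{ 10, 4096 }, { 20, 8192 }, { 30, 4096 }, { 40, 16384 },
	};

	/* Open on the left, closed on the right: births 20 and 30 match. */
	printf("used in (10, 30]: %ju\n",
	    (uintmax_t)space_in_range(bps, 4, 10, 30));
	return (0);
}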
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c
deleted file mode 100644
index c74d07236c1b..000000000000
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c
+++ /dev/null
@@ -1,301 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
- * Copyright (c) 2014 Integros [integros.com]
- */
-
-#include <sys/arc.h>
-#include <sys/bptree.h>
-#include <sys/dmu.h>
-#include <sys/dmu_objset.h>
-#include <sys/dmu_tx.h>
-#include <sys/dmu_traverse.h>
-#include <sys/dsl_dataset.h>
-#include <sys/dsl_dir.h>
-#include <sys/dsl_pool.h>
-#include <sys/dnode.h>
-#include <sys/refcount.h>
-#include <sys/spa.h>
-
-/*
- * A bptree is a queue of root block pointers from destroyed datasets. When a
- * dataset is destroyed its root block pointer is put on the end of the pool's
- * bptree queue so the dataset's blocks can be freed asynchronously by
- * dsl_scan_sync. This allows the delete operation to finish without traversing
- * all the dataset's blocks.
- *
- * Note that while bt_begin and bt_end are only ever incremented in this code,
- * they are effectively reset to 0 every time the entire bptree is freed because
- * the bptree's object is destroyed and re-created.
- */
-
-typedef struct bptree_args {
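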
- bptree_phys_t *ba_phys; /* data in bonus buffer, dirtied if freeing */
- boolean_t ba_free; /* true if freeing during traversal */
-
- bptree_itor_t *ba_func; /* function to call for each blockpointer */
- void *ba_arg; /* caller supplied argument to ba_func */
- dmu_tx_t *ba_tx; /* caller supplied tx, NULL if not freeing */
-} bptree_args_t;
-
-uint64_t
-bptree_alloc(objset_t *os, dmu_tx_t *tx)
-{
- uint64_t obj;
- dmu_buf_t *db;
- bptree_phys_t *bt;
-
- obj = dmu_object_alloc(os, DMU_OTN_UINT64_METADATA,
- SPA_OLD_MAXBLOCKSIZE, DMU_OTN_UINT64_METADATA,
- sizeof (bptree_phys_t), tx);
-
- /*
- * Bonus buffer contents are already initialized to 0, but for
- * readability we make it explicit.
- */
- VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
- dmu_buf_will_dirty(db, tx);
- bt = db->db_data;
- bt->bt_begin = 0;
- bt->bt_end = 0;
- bt->bt_bytes = 0;
- bt->bt_comp = 0;
- bt->bt_uncomp = 0;
- dmu_buf_rele(db, FTAG);
-
- return (obj);
-}
-
-int
-bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
-{
- dmu_buf_t *db;
- bptree_phys_t *bt;
-
- VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
- bt = db->db_data;
- ASSERT3U(bt->bt_begin, ==, bt->bt_end);
- ASSERT0(bt->bt_bytes);
- ASSERT0(bt->bt_comp);
- ASSERT0(bt->bt_uncomp);
- dmu_buf_rele(db, FTAG);
-
- return (dmu_object_free(os, obj, tx));
-}
-
-boolean_t
-bptree_is_empty(objset_t *os, uint64_t obj)
-{
- dmu_buf_t *db;
- bptree_phys_t *bt;
- boolean_t rv;
-
- VERIFY0(dmu_bonus_hold(os, obj, FTAG, &db));
- bt = db->db_data;
- rv = (bt->bt_begin == bt->bt_end);
- dmu_buf_rele(db, FTAG);
- return (rv);
-}
-
-void
-bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg,
- uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx)
-{
- dmu_buf_t *db;
- bptree_phys_t *bt;
- bptree_entry_phys_t bte = { 0 };
-
- /*
- * bptree objects are in the pool mos, therefore they can only be
- * modified in syncing context. Furthermore, this is only modified
- * by the sync thread, so no locking is necessary.
- */
- ASSERT(dmu_tx_is_syncing(tx));
-
- VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
- bt = db->db_data;
-
- bte.be_birth_txg = birth_txg;
- bte.be_bp = *bp;
- dmu_write(os, obj, bt->bt_end * sizeof (bte), sizeof (bte), &bte, tx);
-
- dmu_buf_will_dirty(db, tx);
- bt->bt_end++;
- bt->bt_bytes += bytes;
- bt->bt_comp += comp;
- bt->bt_uncomp += uncomp;
- dmu_buf_rele(db, FTAG);
-}
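/*
 * A standalone sketch (not part of this diff) of the bt_begin/bt_end
 * bookkeeping: entries are appended at bt_end and logically removed by
 * advancing bt_begin, and neither counter ever decreases, matching the
 * comment at the top of this file. The real object is not a ring buffer;
 * the modulo below only bounds the toy's in-memory storage.
 */
#include <stdint.h>
#include <stdio.h>

#define	TOY_CAP	16

typedef struct {
	uint64_t begin, end;		/* only ever incremented */
	uint64_t entries[TOY_CAP];
} toy_bptree_t;

static void
toy_add(toy_bptree_t *bt, uint64_t e)
{
	bt->entries[bt->end % TOY_CAP] = e;	/* dmu_write() analogue */
	bt->end++;
}

static int
toy_take(toy_bptree_t *bt, uint64_t *ep)
{
	if (bt->begin == bt->end)
		return (-1);			/* empty */
	*ep = bt->entries[bt->begin % TOY_CAP];
	bt->begin++;				/* logical removal */
	return (0);
}

int
main(void)
{
	toy_bptree_t bt = { 0 };
	uint64_t e;

	toy_add(&bt, 100);
	toy_add(&bt, 200);
	while (toy_take(&bt, &e) == 0)
		printf("processed root bp %ju\n", (uintmax_t)e);
	printf("begin=%ju end=%ju (neither ever decreased)\n",
	    (uintmax_t)bt.begin, (uintmax_t)bt.end);
	return (0);
}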
-
-/* ARGSUSED */
-static int
-bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
- const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
-{
- int err;
- struct bptree_args *ba = arg;
-
- if (bp == NULL || BP_IS_HOLE(bp))
- return (0);
-
- err = ba->ba_func(ba->ba_arg, bp, ba->ba_tx);
- if (err == 0 && ba->ba_free) {
- ba->ba_phys->bt_bytes -= bp_get_dsize_sync(spa, bp);
- ba->ba_phys->bt_comp -= BP_GET_PSIZE(bp);
- ba->ba_phys->bt_uncomp -= BP_GET_UCSIZE(bp);
- }
- return (err);
-}
-
-/*
- * If "free" is set:
- * - It is assumed that "func" will be freeing the block pointers.
- * - If "func" returns nonzero, the bookmark will be remembered and
- * iteration will be restarted from this point on next invocation.
- * - If an i/o error is encountered (e.g. "func" returns EIO or ECKSUM),
- * bptree_iterate will remember the bookmark, continue traversing
- * any additional entries, and return 0.
- *
- * If "free" is not set, traversal will stop and return an error if
- * an i/o error is encountered.
- *
- * In either case, if zfs_free_leak_on_eio is set, i/o errors will be
- * ignored and traversal will continue (i.e. TRAVERSE_HARD will be passed to
- * traverse_dataset_destroyed()).
- */
-int
-bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func,
- void *arg, dmu_tx_t *tx)
-{
- boolean_t ioerr = B_FALSE;
- int err;
- uint64_t i;
- dmu_buf_t *db;
- struct bptree_args ba;
-
- ASSERT(!free || dmu_tx_is_syncing(tx));
-
- err = dmu_bonus_hold(os, obj, FTAG, &db);
- if (err != 0)
- return (err);
-
- if (free)
- dmu_buf_will_dirty(db, tx);
-
- ba.ba_phys = db->db_data;
- ba.ba_free = free;
- ba.ba_func = func;
- ba.ba_arg = arg;
- ba.ba_tx = tx;
-
- err = 0;
- for (i = ba.ba_phys->bt_begin; i < ba.ba_phys->bt_end; i++) {
- bptree_entry_phys_t bte;
- int flags = TRAVERSE_PREFETCH_METADATA | TRAVERSE_POST;
-
- err = dmu_read(os, obj, i * sizeof (bte), sizeof (bte),
- &bte, DMU_READ_NO_PREFETCH);
- if (err != 0)
- break;
-
- if (zfs_free_leak_on_eio)
- flags |= TRAVERSE_HARD;
- zfs_dbgmsg("bptree index %lld: traversing from min_txg=%lld "
- "bookmark %lld/%lld/%lld/%lld",
- (longlong_t)i,
- (longlong_t)bte.be_birth_txg,
- (longlong_t)bte.be_zb.zb_objset,
- (longlong_t)bte.be_zb.zb_object,
- (longlong_t)bte.be_zb.zb_level,
- (longlong_t)bte.be_zb.zb_blkid);
- err = traverse_dataset_destroyed(os->os_spa, &bte.be_bp,
- bte.be_birth_txg, &bte.be_zb, flags,
- bptree_visit_cb, &ba);
- if (free) {
- /*
- * The callback has freed the visited block pointers.
- * Record our traversal progress on disk, either by
- * updating this record's bookmark, or by logically
- * removing this record by advancing bt_begin.
- */
- if (err != 0) {
- /* save bookmark for future resume */
- ASSERT3U(bte.be_zb.zb_objset, ==,
- ZB_DESTROYED_OBJSET);
- ASSERT0(bte.be_zb.zb_level);
- dmu_write(os, obj, i * sizeof (bte),
- sizeof (bte), &bte, tx);
- if (err == EIO || err == ECKSUM ||
- err == ENXIO) {
- /*
- * Skip the rest of this tree and
- * continue on to the next entry.
- */
- err = 0;
- ioerr = B_TRUE;
- } else {
- break;
- }
- } else if (ioerr) {
- /*
- * This entry is finished, but there were
- * i/o errors on previous entries, so we
- * can't adjust bt_begin. Set this entry's
- * be_birth_txg such that it will be
- * treated as a no-op in future traversals.
- */
- bte.be_birth_txg = UINT64_MAX;
- dmu_write(os, obj, i * sizeof (bte),
- sizeof (bte), &bte, tx);
- }
-
- if (!ioerr) {
- ba.ba_phys->bt_begin++;
- (void) dmu_free_range(os, obj,
- i * sizeof (bte), sizeof (bte), tx);
- }
- } else if (err != 0) {
- break;
- }
- }
-
- ASSERT(!free || err != 0 || ioerr ||
- ba.ba_phys->bt_begin == ba.ba_phys->bt_end);
-
- /* if all blocks are free there should be no used space */
- if (ba.ba_phys->bt_begin == ba.ba_phys->bt_end) {
- if (zfs_free_leak_on_eio) {
- ba.ba_phys->bt_bytes = 0;
- ba.ba_phys->bt_comp = 0;
- ba.ba_phys->bt_uncomp = 0;
- }
-
- ASSERT0(ba.ba_phys->bt_bytes);
- ASSERT0(ba.ba_phys->bt_comp);
- ASSERT0(ba.ba_phys->bt_uncomp);
- }
-
- dmu_buf_rele(db, FTAG);
-
- return (err);
-}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bqueue.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bqueue.c
deleted file mode 100644
index 1ddc697b5424..000000000000
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bqueue.c
+++ /dev/null
@@ -1,111 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * This file and its contents are supplied under the terms of the
- * Common Development and Distribution License ("CDDL"), version 1.0.
- * You may only use this file in accordance with the terms of version
- * 1.0 of the CDDL.
- *
- * A full copy of the text of the CDDL should have accompanied this
- * source. A copy of the CDDL is also available via the Internet at
- * http://www.illumos.org/license/CDDL.
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2014 by Delphix. All rights reserved.
- */
-
-#include <sys/bqueue.h>
-#include <sys/zfs_context.h>
-
-static inline bqueue_node_t *
-obj2node(bqueue_t *q, void *data)
-{
- return ((bqueue_node_t *)((char *)data + q->bq_node_offset));
-}
-
-/*
- * Initialize a blocking queue. The maximum capacity of the queue is set to
- * size. Types that want to be stored in a bqueue must contain a bqueue_node_t,
- * and node_offset should give its offset from the start of the struct.
- * Return 0 on success, or -1 on failure.
- */
-int
-bqueue_init(bqueue_t *q, uint64_t size, size_t node_offset)
-{
- list_create(&q->bq_list, node_offset + sizeof (bqueue_node_t),
- node_offset + offsetof(bqueue_node_t, bqn_node));
- cv_init(&q->bq_add_cv, NULL, CV_DEFAULT, NULL);
- cv_init(&q->bq_pop_cv, NULL, CV_DEFAULT, NULL);
- mutex_init(&q->bq_lock, NULL, MUTEX_DEFAULT, NULL);
- q->bq_node_offset = node_offset;
- q->bq_size = 0;
- q->bq_maxsize = size;
- return (0);
-}
-
-/*
- * Destroy a blocking queue. This function asserts that there are no
- * elements in the queue, and no one is blocked on the condition
- * variables.
- */
-void
-bqueue_destroy(bqueue_t *q)
-{
- ASSERT0(q->bq_size);
- cv_destroy(&q->bq_add_cv);
- cv_destroy(&q->bq_pop_cv);
- mutex_destroy(&q->bq_lock);
- list_destroy(&q->bq_list);
-}
-
-/*
- * Add data to q, consuming item_size units of capacity. If there is
- * insufficient capacity, block until enough capacity exists. Asserts
- * item_size is > 0 and less than the queue's maximum capacity.
- */
-void
-bqueue_enqueue(bqueue_t *q, void *data, uint64_t item_size)
-{
- ASSERT3U(item_size, >, 0);
- ASSERT3U(item_size, <, q->bq_maxsize);
- mutex_enter(&q->bq_lock);
- obj2node(q, data)->bqn_size = item_size;
- while (q->bq_size + item_size > q->bq_maxsize) {
- cv_wait(&q->bq_add_cv, &q->bq_lock);
- }
- q->bq_size += item_size;
- list_insert_tail(&q->bq_list, data);
- cv_signal(&q->bq_pop_cv);
- mutex_exit(&q->bq_lock);
-}
-
-/*
- * Take the first element off of q. If there are no elements on the queue, wait
- * until one is put there. Return the removed element.
- */
-void *
-bqueue_dequeue(bqueue_t *q)
-{
- void *ret;
- uint64_t item_size;
- mutex_enter(&q->bq_lock);
- while (q->bq_size == 0) {
- cv_wait(&q->bq_pop_cv, &q->bq_lock);
- }
- ret = list_remove_head(&q->bq_list);
- item_size = obj2node(q, ret)->bqn_size;
- q->bq_size -= item_size;
- mutex_exit(&q->bq_lock);
- cv_signal(&q->bq_add_cv);
- return (ret);
-}
-
-/*
- * Returns true if the space used is 0.
- */
-boolean_t
-bqueue_empty(bqueue_t *q)
-{
- return (q->bq_size == 0);
-}
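/*
 * A standalone userspace sketch (not part of this diff) of the bqueue
 * pattern: one mutex and two condition variables guarding a shared
 * capacity budget, rebuilt with POSIX threads (compile with -pthread).
 * It mirrors the enqueue/dequeue logic above, including signaling the
 * producer after the lock is dropped, but stores plain integers instead
 * of embedded bqueue_node_t links.
 */
#include <pthread.h>
#include <stdio.h>

#define	QCAP	4	/* bq_maxsize analogue, in items */

typedef struct {
	pthread_mutex_t lock;
	pthread_cond_t add_cv;	/* signaled when space is freed */
	pthread_cond_t pop_cv;	/* signaled when an item arrives */
	int items[QCAP];
	int head, count;
} toy_bqueue_t;

static void
toy_enqueue(toy_bqueue_t *q, int v)
{
	pthread_mutex_lock(&q->lock);
	while (q->count == QCAP)
		pthread_cond_wait(&q->add_cv, &q->lock);
	q->items[(q->head + q->count) % QCAP] = v;
	q->count++;
	pthread_cond_signal(&q->pop_cv);
	pthread_mutex_unlock(&q->lock);
}

static int
toy_dequeue(toy_bqueue_t *q)
{
	pthread_mutex_lock(&q->lock);
	while (q->count == 0)
		pthread_cond_wait(&q->pop_cv, &q->lock);
	int v = q->items[q->head];
	q->head = (q->head + 1) % QCAP;
	q->count--;
	pthread_mutex_unlock(&q->lock);
	pthread_cond_signal(&q->add_cv);	/* as above: signal after drop */
	return (v);
}

static void *
producer(void *arg)
{
	toy_bqueue_t *q = arg;

	for (int i = 1; i <= 8; i++)
		toy_enqueue(q, i);
	toy_enqueue(q, -1);			/* sentinel: done */
	return (NULL);
}

int
main(void)
{
	toy_bqueue_t q = { PTHREAD_MUTEX_INITIALIZER,
	    PTHREAD_COND_INITIALIZER, PTHREAD_COND_INITIALIZER };
	pthread_t t;
	int v;

	pthread_create(&t, NULL, producer, &q);
	while ((v = toy_dequeue(&q)) != -1)
		printf("consumed %d\n", v);
	pthread_join(t, NULL);
	return (0);
}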
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/cityhash.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/cityhash.c
deleted file mode 100644
index 2b62edad0342..000000000000
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/cityhash.c
+++ /dev/null
@@ -1,63 +0,0 @@
-// Copyright (c) 2011 Google, Inc.
-//
-// Permission is hereby granted, free of charge, to any person obtaining a copy
-// of this software and associated documentation files (the "Software"), to deal
-// in the Software without restriction, including without limitation the rights
-// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-// copies of the Software, and to permit persons to whom the Software is
-// furnished to do so, subject to the following conditions:
-//
-// The above copyright notice and this permission notice shall be included in
-// all copies or substantial portions of the Software.
-//
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-// THE SOFTWARE.
-
-/*
- * Copyright (c) 2017 by Delphix. All rights reserved.
- */
-
-#include <sys/cityhash.h>
-
-#define HASH_K1 0xb492b66fbe98f273ULL
-#define HASH_K2 0x9ae16a3b2f90404fULL
-
-/*
- * Bitwise right rotate. Normally this will compile to a single
- * instruction.
- */
-static inline uint64_t
-rotate(uint64_t val, int shift)
-{
- // Avoid shifting by 64: doing so yields an undefined result.
- return (shift == 0 ? val : (val >> shift) | (val << (64 - shift)));
-}
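/*
 * A standalone note (not part of this diff) on why the shift == 0 guard
 * above matters: `val << (64 - 0)` is a 64-bit shift by 64, which is
 * undefined behavior in C. A quick check of the guarded rotate:
 */
#include <stdint.h>
#include <stdio.h>

static inline uint64_t
rotr64(uint64_t val, int shift)
{
	/* Same guard as rotate() above. */
	return (shift == 0 ? val : (val >> shift) | (val << (64 - shift)));
}

int
main(void)
{
	uint64_t v = 0x0123456789abcdefULL;

	printf("%016jx rotr 0  = %016jx\n", (uintmax_t)v,
	    (uintmax_t)rotr64(v, 0));	/* unchanged, and no UB */
	printf("%016jx rotr 16 = %016jx\n", (uintmax_t)v,
	    (uintmax_t)rotr64(v, 16));	/* cdef0123456789ab */
	return (0);
}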
-
-static inline uint64_t
-cityhash_helper(uint64_t u, uint64_t v, uint64_t mul)
-{
- uint64_t a = (u ^ v) * mul;
- a ^= (a >> 47);
- uint64_t b = (v ^ a) * mul;
- b ^= (b >> 47);
- b *= mul;
- return (b);
-}
-
-uint64_t
-cityhash4(uint64_t w1, uint64_t w2, uint64_t w3, uint64_t w4)
-{
- uint64_t mul = HASH_K2 + 64;
- uint64_t a = w1 * HASH_K1;
- uint64_t b = w2;
- uint64_t c = w4 * mul;
- uint64_t d = w3 * HASH_K2;
- return (cityhash_helper(rotate(a + b, 43) + rotate(c, 30) + d,
- a + rotate(b + HASH_K2, 18) + c, mul));
-}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
deleted file mode 100644
index 1974ff2197c2..000000000000
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
+++ /dev/null
@@ -1,4248 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
- * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
- * Copyright (c) 2013, Joyent, Inc. All rights reserved.
- * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
- * Copyright (c) 2014 Integros [integros.com]
- */
-
-#include <sys/zfs_context.h>
-#include <sys/dmu.h>
-#include <sys/dmu_send.h>
-#include <sys/dmu_impl.h>
-#include <sys/dbuf.h>
-#include <sys/dmu_objset.h>
-#include <sys/dsl_dataset.h>
-#include <sys/dsl_dir.h>
-#include <sys/dmu_tx.h>
-#include <sys/spa.h>
-#include <sys/zio.h>
-#include <sys/dmu_zfetch.h>
-#include <sys/sa.h>
-#include <sys/sa_impl.h>
-#include <sys/zfeature.h>
-#include <sys/blkptr.h>
-#include <sys/range_tree.h>
-#include <sys/callb.h>
-#include <sys/abd.h>
-#include <sys/vdev.h>
-#include <sys/cityhash.h>
-#include <sys/spa_impl.h>
-
-kstat_t *dbuf_ksp;
-
-typedef struct dbuf_stats {
- /*
- * Various statistics about the size of the dbuf cache.
- */
- kstat_named_t cache_count;
- kstat_named_t cache_size_bytes;
- kstat_named_t cache_size_bytes_max;
- /*
- * Statistics regarding the bounds on the dbuf cache size.
- */
- kstat_named_t cache_target_bytes;
- kstat_named_t cache_lowater_bytes;
- kstat_named_t cache_hiwater_bytes;
- /*
- * Total number of dbuf cache evictions that have occurred.
- */
- kstat_named_t cache_total_evicts;
- /*
- * The distribution of dbuf levels in the dbuf cache and
- * the total size of all dbufs at each level.
- */
- kstat_named_t cache_levels[DN_MAX_LEVELS];
- kstat_named_t cache_levels_bytes[DN_MAX_LEVELS];
- /*
- * Statistics about the dbuf hash table.
- */
- kstat_named_t hash_hits;
- kstat_named_t hash_misses;
- kstat_named_t hash_collisions;
- kstat_named_t hash_elements;
- kstat_named_t hash_elements_max;
- /*
- * Number of sublists containing more than one dbuf in the dbuf
- * hash table. Keep track of the longest hash chain.
- */
- kstat_named_t hash_chains;
- kstat_named_t hash_chain_max;
- /*
- * Number of times a dbuf_create() discovers that a dbuf was
- * already created and in the dbuf hash table.
- */
- kstat_named_t hash_insert_race;
- /*
- * Statistics about the size of the metadata dbuf cache.
- */
- kstat_named_t metadata_cache_count;
- kstat_named_t metadata_cache_size_bytes;
- kstat_named_t metadata_cache_size_bytes_max;
- /*
- * For diagnostic purposes, this is incremented whenever we can't add
- * something to the metadata cache because it's full, and instead put
- * the data in the regular dbuf cache.
- */
- kstat_named_t metadata_cache_overflow;
-} dbuf_stats_t;
-
-dbuf_stats_t dbuf_stats = {
- { "cache_count", KSTAT_DATA_UINT64 },
- { "cache_size_bytes", KSTAT_DATA_UINT64 },
- { "cache_size_bytes_max", KSTAT_DATA_UINT64 },
- { "cache_target_bytes", KSTAT_DATA_UINT64 },
- { "cache_lowater_bytes", KSTAT_DATA_UINT64 },
- { "cache_hiwater_bytes", KSTAT_DATA_UINT64 },
- { "cache_total_evicts", KSTAT_DATA_UINT64 },
- { { "cache_levels_N", KSTAT_DATA_UINT64 } },
- { { "cache_levels_bytes_N", KSTAT_DATA_UINT64 } },
- { "hash_hits", KSTAT_DATA_UINT64 },
- { "hash_misses", KSTAT_DATA_UINT64 },
- { "hash_collisions", KSTAT_DATA_UINT64 },
- { "hash_elements", KSTAT_DATA_UINT64 },
- { "hash_elements_max", KSTAT_DATA_UINT64 },
- { "hash_chains", KSTAT_DATA_UINT64 },
- { "hash_chain_max", KSTAT_DATA_UINT64 },
- { "hash_insert_race", KSTAT_DATA_UINT64 },
- { "metadata_cache_count", KSTAT_DATA_UINT64 },
- { "metadata_cache_size_bytes", KSTAT_DATA_UINT64 },
- { "metadata_cache_size_bytes_max", KSTAT_DATA_UINT64 },
- { "metadata_cache_overflow", KSTAT_DATA_UINT64 }
-};
-
-#define DBUF_STAT_INCR(stat, val) \
- atomic_add_64(&dbuf_stats.stat.value.ui64, (val));
-#define DBUF_STAT_DECR(stat, val) \
- DBUF_STAT_INCR(stat, -(val));
-#define DBUF_STAT_BUMP(stat) \
- DBUF_STAT_INCR(stat, 1);
-#define DBUF_STAT_BUMPDOWN(stat) \
- DBUF_STAT_INCR(stat, -1);
-#define DBUF_STAT_MAX(stat, v) { \
- uint64_t _m; \
- while ((v) > (_m = dbuf_stats.stat.value.ui64) && \
- (_m != atomic_cas_64(&dbuf_stats.stat.value.ui64, _m, (v))))\
- continue; \
-}
-
-struct dbuf_hold_impl_data {
- /* Function arguments */
- dnode_t *dh_dn;
- uint8_t dh_level;
- uint64_t dh_blkid;
- boolean_t dh_fail_sparse;
- boolean_t dh_fail_uncached;
- void *dh_tag;
- dmu_buf_impl_t **dh_dbp;
- /* Local variables */
- dmu_buf_impl_t *dh_db;
- dmu_buf_impl_t *dh_parent;
- blkptr_t *dh_bp;
- int dh_err;
- dbuf_dirty_record_t *dh_dr;
- int dh_depth;
-};
-
-static void __dbuf_hold_impl_init(struct dbuf_hold_impl_data *dh,
- dnode_t *dn, uint8_t level, uint64_t blkid, boolean_t fail_sparse,
- boolean_t fail_uncached,
- void *tag, dmu_buf_impl_t **dbp, int depth);
-static int __dbuf_hold_impl(struct dbuf_hold_impl_data *dh);
-
-static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
-static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
-
-/*
- * Global data structures and functions for the dbuf cache.
- */
-static kmem_cache_t *dbuf_kmem_cache;
-static taskq_t *dbu_evict_taskq;
-
-static kthread_t *dbuf_cache_evict_thread;
-static kmutex_t dbuf_evict_lock;
-static kcondvar_t dbuf_evict_cv;
-static boolean_t dbuf_evict_thread_exit;
-
-/*
- * There are two dbuf caches; each dbuf can only be in one of them at a time.
- *
- * 1. Cache of metadata dbufs, to help make read-heavy administrative commands
- * from /sbin/zfs run faster. The "metadata cache" specifically stores dbufs
- * that represent the metadata that describes filesystems/snapshots/
- * bookmarks/properties/etc. We only evict from this cache when we export a
- * pool, to short-circuit as much I/O as possible for all administrative
- * commands that need the metadata. There is no eviction policy for this
- * cache, because we try to only include types in it which would occupy a
- * very small amount of space per object but create a large impact on the
- * performance of these commands. Instead, after it reaches a maximum size
- * (which should only happen on very small memory systems with a very large
- * number of filesystem objects), we stop taking new dbufs into the
- * metadata cache, instead putting them in the normal dbuf cache.
- *
- * 2. LRU cache of dbufs. The dbuf cache maintains a list of dbufs that
- * are not currently held but have been recently released. These dbufs
- * are not eligible for arc eviction until they are aged out of the cache.
- * Dbufs that are aged out of the cache will be immediately destroyed and
- * become eligible for arc eviction.
- *
- * Dbufs are added to these caches once the last hold is released. If a dbuf is
- * later accessed and still exists in the dbuf cache, then it will be removed
- * from the cache and later re-added to the head of the cache.
- *
- * If a given dbuf meets the requirements for the metadata cache, it will go
- * there, otherwise it will be considered for the generic LRU dbuf cache. The
- * caches and the refcounts tracking their sizes are stored in an array indexed
- * by those caches' matching enum values (from dbuf_cached_state_t).
- */
-typedef struct dbuf_cache {
- multilist_t *cache;
- zfs_refcount_t size;
-} dbuf_cache_t;
-dbuf_cache_t dbuf_caches[DB_CACHE_MAX];
-
-/* Size limits for the caches */
-uint64_t dbuf_cache_max_bytes = 0;
-uint64_t dbuf_metadata_cache_max_bytes = 0;
-/* Set the default sizes of the caches to log2 fraction of arc size */
-int dbuf_cache_shift = 5;
-int dbuf_metadata_cache_shift = 6;
-
-/*
- * For diagnostic purposes, this is incremented whenever we can't add
- * something to the metadata cache because it's full, and instead put
- * the data in the regular dbuf cache.
- */
-uint64_t dbuf_metadata_cache_overflow;
-
-/*
- * The LRU dbuf cache uses a three-stage eviction policy:
- * - A low water marker designates when the dbuf eviction thread
- * should stop evicting from the dbuf cache.
- * - When we reach the maximum size (aka mid water mark), we
- * signal the eviction thread to run.
- * - The high water mark indicates when the eviction thread
- * is unable to keep up with the incoming load and eviction must
- * happen in the context of the calling thread.
- *
- * The dbuf cache:
- * (max size)
- * low water mid water hi water
- * +----------------------------------------+----------+----------+
- * | | | |
- * | | | |
- * | | | |
- * | | | |
- * +----------------------------------------+----------+----------+
- * stop signal evict
- * evicting eviction directly
- * thread
- *
- * The high and low water marks indicate the operating range for the eviction
- * thread. The low water mark is, by default, 90% of the total size of the
- * cache and the high water mark is at 110% (both of these percentages can be
- * changed by setting dbuf_cache_lowater_pct and dbuf_cache_hiwater_pct,
- * respectively). The eviction thread will try to ensure that the cache remains
- * within this range by waking up every second and checking if the cache is
- * above the low water mark. The thread can also be woken up by callers adding
- * elements into the cache if the cache is larger than the mid water (i.e. max
- * cache size). Once the eviction thread is woken up and eviction is required,
- * it will continue evicting buffers until it's able to reduce the cache size
- * to the low water mark. If the cache size continues to grow and hits the high
- * water mark, then callers adding elements to the cache will begin to evict
- * directly from the cache until the cache is no longer above the high water
- * mark.
- */
-
-/*
- * The percentage above and below the maximum cache size.
- */
-uint_t dbuf_cache_hiwater_pct = 10;
-uint_t dbuf_cache_lowater_pct = 10;
-
-SYSCTL_DECL(_vfs_zfs);
-SYSCTL_QUAD(_vfs_zfs, OID_AUTO, dbuf_cache_max_bytes, CTLFLAG_RWTUN,
- &dbuf_cache_max_bytes, 0, "dbuf cache size in bytes");
-SYSCTL_QUAD(_vfs_zfs, OID_AUTO, dbuf_metadata_cache_max_bytes, CTLFLAG_RWTUN,
- &dbuf_metadata_cache_max_bytes, 0, "dbuf metadata cache size in bytes");
-SYSCTL_INT(_vfs_zfs, OID_AUTO, dbuf_cache_shift, CTLFLAG_RDTUN,
- &dbuf_cache_shift, 0, "dbuf cache size as log2 fraction of ARC");
-SYSCTL_INT(_vfs_zfs, OID_AUTO, dbuf_metadata_cache_shift, CTLFLAG_RDTUN,
- &dbuf_metadata_cache_shift, 0,
- "dbuf metadata cache size as log2 fraction of ARC");
-SYSCTL_QUAD(_vfs_zfs, OID_AUTO, dbuf_metadata_cache_overflow, CTLFLAG_RD,
- &dbuf_metadata_cache_overflow, 0, "dbuf metadata cache overflow");
-SYSCTL_UINT(_vfs_zfs, OID_AUTO, dbuf_cache_hiwater_pct, CTLFLAG_RWTUN,
-    &dbuf_cache_hiwater_pct, 0, "max percent above the dbuf cache size");
-SYSCTL_UINT(_vfs_zfs, OID_AUTO, dbuf_cache_lowater_pct, CTLFLAG_RWTUN,
-    &dbuf_cache_lowater_pct, 0, "max percent below the dbuf cache size");
-
-/* ARGSUSED */
-static int
-dbuf_cons(void *vdb, void *unused, int kmflag)
-{
- dmu_buf_impl_t *db = vdb;
- bzero(db, sizeof (dmu_buf_impl_t));
-
- mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
- cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
- multilist_link_init(&db->db_cache_link);
- zfs_refcount_create(&db->db_holds);
-
- return (0);
-}
-
-/* ARGSUSED */
-static void
-dbuf_dest(void *vdb, void *unused)
-{
- dmu_buf_impl_t *db = vdb;
- mutex_destroy(&db->db_mtx);
- cv_destroy(&db->db_changed);
- ASSERT(!multilist_link_active(&db->db_cache_link));
- zfs_refcount_destroy(&db->db_holds);
-}
-
-/*
- * dbuf hash table routines
- */
-static dbuf_hash_table_t dbuf_hash_table;
-
-static uint64_t dbuf_hash_count;
-
-/*
- * We use Cityhash for this. It's fast, and has good hash properties without
- * requiring any large static buffers.
- */
-static uint64_t
-dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
-{
- return (cityhash4((uintptr_t)os, obj, (uint64_t)lvl, blkid));
-}
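/*
 * A standalone sketch (not part of this diff) of how a 64-bit hash value
 * becomes a bucket index: hash_table_mask is one less than a power-of-two
 * table size, so masking is equivalent to (and cheaper than) a modulo.
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t hv = 0x9ae16a3b2f90404fULL;	/* pretend cityhash4() output */
	uint64_t table_size = 1ULL << 16;	/* must be a power of two */
	uint64_t mask = table_size - 1;		/* hash_table_mask analogue */

	printf("idx (mask) = %ju\n", (uintmax_t)(hv & mask));
	printf("idx (mod)  = %ju\n", (uintmax_t)(hv % table_size));
	return (0);
}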
-
-#define DBUF_EQUAL(dbuf, os, obj, level, blkid) \
- ((dbuf)->db.db_object == (obj) && \
- (dbuf)->db_objset == (os) && \
- (dbuf)->db_level == (level) && \
- (dbuf)->db_blkid == (blkid))
-
-dmu_buf_impl_t *
-dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid)
-{
- dbuf_hash_table_t *h = &dbuf_hash_table;
- uint64_t hv = dbuf_hash(os, obj, level, blkid);
- uint64_t idx = hv & h->hash_table_mask;
- dmu_buf_impl_t *db;
-
- mutex_enter(DBUF_HASH_MUTEX(h, idx));
- for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
- if (DBUF_EQUAL(db, os, obj, level, blkid)) {
- mutex_enter(&db->db_mtx);
- if (db->db_state != DB_EVICTING) {
- mutex_exit(DBUF_HASH_MUTEX(h, idx));
- return (db);
- }
- mutex_exit(&db->db_mtx);
- }
- }
- mutex_exit(DBUF_HASH_MUTEX(h, idx));
- return (NULL);
-}
-
-static dmu_buf_impl_t *
-dbuf_find_bonus(objset_t *os, uint64_t object)
-{
- dnode_t *dn;
- dmu_buf_impl_t *db = NULL;
-
- if (dnode_hold(os, object, FTAG, &dn) == 0) {
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
- if (dn->dn_bonus != NULL) {
- db = dn->dn_bonus;
- mutex_enter(&db->db_mtx);
- }
- rw_exit(&dn->dn_struct_rwlock);
- dnode_rele(dn, FTAG);
- }
- return (db);
-}
-
-/*
- * Insert an entry into the hash table. If there is already an element
- * equal to elem in the hash table, then the already existing element
- * will be returned and the new element will not be inserted.
- * Otherwise returns NULL.
- */
-static dmu_buf_impl_t *
-dbuf_hash_insert(dmu_buf_impl_t *db)
-{
- dbuf_hash_table_t *h = &dbuf_hash_table;
- objset_t *os = db->db_objset;
- uint64_t obj = db->db.db_object;
- int level = db->db_level;
- uint64_t blkid, hv, idx;
- dmu_buf_impl_t *dbf;
- uint32_t i;
-
- blkid = db->db_blkid;
- hv = dbuf_hash(os, obj, level, blkid);
- idx = hv & h->hash_table_mask;
-
- mutex_enter(DBUF_HASH_MUTEX(h, idx));
- for (dbf = h->hash_table[idx], i = 0; dbf != NULL;
- dbf = dbf->db_hash_next, i++) {
- if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
- mutex_enter(&dbf->db_mtx);
- if (dbf->db_state != DB_EVICTING) {
- mutex_exit(DBUF_HASH_MUTEX(h, idx));
- return (dbf);
- }
- mutex_exit(&dbf->db_mtx);
- }
- }
-
- if (i > 0) {
- DBUF_STAT_BUMP(hash_collisions);
- if (i == 1)
- DBUF_STAT_BUMP(hash_chains);
-
- DBUF_STAT_MAX(hash_chain_max, i);
- }
-
- mutex_enter(&db->db_mtx);
- db->db_hash_next = h->hash_table[idx];
- h->hash_table[idx] = db;
- mutex_exit(DBUF_HASH_MUTEX(h, idx));
- atomic_inc_64(&dbuf_hash_count);
- DBUF_STAT_MAX(hash_elements_max, dbuf_hash_count);
-
- return (NULL);
-}
-
-/*
- * Remove an entry from the hash table. It must be in the EVICTING state.
- */
-static void
-dbuf_hash_remove(dmu_buf_impl_t *db)
-{
- dbuf_hash_table_t *h = &dbuf_hash_table;
- uint64_t hv, idx;
- dmu_buf_impl_t *dbf, **dbp;
-
- hv = dbuf_hash(db->db_objset, db->db.db_object,
- db->db_level, db->db_blkid);
- idx = hv & h->hash_table_mask;
-
- /*
- * We mustn't hold db_mtx to maintain lock ordering:
- * DBUF_HASH_MUTEX > db_mtx.
- */
- ASSERT(zfs_refcount_is_zero(&db->db_holds));
- ASSERT(db->db_state == DB_EVICTING);
- ASSERT(!MUTEX_HELD(&db->db_mtx));
-
- mutex_enter(DBUF_HASH_MUTEX(h, idx));
- dbp = &h->hash_table[idx];
- while ((dbf = *dbp) != db) {
- dbp = &dbf->db_hash_next;
- ASSERT(dbf != NULL);
- }
- *dbp = db->db_hash_next;
- db->db_hash_next = NULL;
- if (h->hash_table[idx] &&
- h->hash_table[idx]->db_hash_next == NULL)
- DBUF_STAT_BUMPDOWN(hash_chains);
- mutex_exit(DBUF_HASH_MUTEX(h, idx));
- atomic_dec_64(&dbuf_hash_count);
-}
-
-typedef enum {
- DBVU_EVICTING,
- DBVU_NOT_EVICTING
-} dbvu_verify_type_t;
-
-static void
-dbuf_verify_user(dmu_buf_impl_t *db, dbvu_verify_type_t verify_type)
-{
-#ifdef ZFS_DEBUG
- int64_t holds;
-
- if (db->db_user == NULL)
- return;
-
- /* Only data blocks support the attachment of user data. */
- ASSERT(db->db_level == 0);
-
- /* Clients must resolve a dbuf before attaching user data. */
- ASSERT(db->db.db_data != NULL);
- ASSERT3U(db->db_state, ==, DB_CACHED);
-
- holds = zfs_refcount_count(&db->db_holds);
- if (verify_type == DBVU_EVICTING) {
- /*
- * Immediate eviction occurs when holds == dirtycnt.
- * For normal eviction buffers, holds is zero on
- * eviction, except when dbuf_fix_old_data() calls
- * dbuf_clear_data(). However, the hold count can grow
- * during eviction even though db_mtx is held (see
- * dmu_bonus_hold() for an example), so we can only
- * test the generic invariant that holds >= dirtycnt.
- */
- ASSERT3U(holds, >=, db->db_dirtycnt);
- } else {
- if (db->db_user_immediate_evict == TRUE)
- ASSERT3U(holds, >=, db->db_dirtycnt);
- else
- ASSERT3U(holds, >, 0);
- }
-#endif
-}
-
-static void
-dbuf_evict_user(dmu_buf_impl_t *db)
-{
- dmu_buf_user_t *dbu = db->db_user;
-
- ASSERT(MUTEX_HELD(&db->db_mtx));
-
- if (dbu == NULL)
- return;
-
- dbuf_verify_user(db, DBVU_EVICTING);
- db->db_user = NULL;
-
-#ifdef ZFS_DEBUG
- if (dbu->dbu_clear_on_evict_dbufp != NULL)
- *dbu->dbu_clear_on_evict_dbufp = NULL;
-#endif
-
- /*
- * There are two eviction callbacks - one that we call synchronously
- * and one that we invoke via a taskq. The async one is useful for
- * avoiding lock order reversals and limiting stack depth.
- *
- * Note that if we have a sync callback but no async callback,
- * it's likely that the sync callback will free the structure
- * containing the dbu. In that case we need to take care to not
- * dereference dbu after calling the sync evict func.
- */
- boolean_t has_async = (dbu->dbu_evict_func_async != NULL);
-
- if (dbu->dbu_evict_func_sync != NULL)
- dbu->dbu_evict_func_sync(dbu);
-
- if (has_async) {
- taskq_dispatch_ent(dbu_evict_taskq, dbu->dbu_evict_func_async,
- dbu, 0, &dbu->dbu_tqent);
- }
-}
-
-boolean_t
-dbuf_is_metadata(dmu_buf_impl_t *db)
-{
- if (db->db_level > 0) {
- return (B_TRUE);
- } else {
- boolean_t is_metadata;
-
- DB_DNODE_ENTER(db);
- is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type);
- DB_DNODE_EXIT(db);
-
- return (is_metadata);
- }
-}
-
-/*
- * This returns whether this dbuf should be stored in the metadata cache, which
- * is based on whether it's from one of the dnode types that store data related
- * to traversing dataset hierarchies.
- */
-static boolean_t
-dbuf_include_in_metadata_cache(dmu_buf_impl_t *db)
-{
- DB_DNODE_ENTER(db);
- dmu_object_type_t type = DB_DNODE(db)->dn_type;
- DB_DNODE_EXIT(db);
-
- /* Check if this dbuf is one of the types we care about */
- if (DMU_OT_IS_METADATA_CACHED(type)) {
- /* If we hit this, then we set something up wrong in dmu_ot */
- ASSERT(DMU_OT_IS_METADATA(type));
-
- /*
- * Sanity check for small-memory systems: don't allocate too
- * much memory for this purpose.
- */
- if (zfs_refcount_count(
- &dbuf_caches[DB_DBUF_METADATA_CACHE].size) >
- dbuf_metadata_cache_max_bytes) {
- dbuf_metadata_cache_overflow++;
- DTRACE_PROBE1(dbuf__metadata__cache__overflow,
- dmu_buf_impl_t *, db);
- return (B_FALSE);
- }
-
- return (B_TRUE);
- }
-
- return (B_FALSE);
-}
-
-/*
- * This function *must* return indices evenly distributed between all
- * sublists of the multilist. This is needed due to how the dbuf eviction
- * code is laid out; dbuf_evict_thread() assumes dbufs are evenly
- * distributed between all sublists and uses this assumption when
- * deciding which sublist to evict from and how much to evict from it.
- */
-unsigned int
-dbuf_cache_multilist_index_func(multilist_t *ml, void *obj)
-{
- dmu_buf_impl_t *db = obj;
-
- /*
-	 * The assumption here is that the hash value for a given
-	 * dmu_buf_impl_t will remain constant throughout its lifetime
-	 * (i.e. its objset, object, level and blkid fields don't change).
- * Thus, we don't need to store the dbuf's sublist index
- * on insertion, as this index can be recalculated on removal.
- *
- * Also, the low order bits of the hash value are thought to be
- * distributed evenly. Otherwise, in the case that the multilist
- * has a power of two number of sublists, each sublists' usage
- * would not be evenly distributed.
- */
- return (dbuf_hash(db->db_objset, db->db.db_object,
- db->db_level, db->db_blkid) %
- multilist_get_num_sublists(ml));
-}
-
-static inline unsigned long
-dbuf_cache_target_bytes(void)
-{
-	return (MIN(dbuf_cache_max_bytes,
-	    arc_max_bytes() >> dbuf_cache_shift));
-}
-
-static inline uint64_t
-dbuf_cache_hiwater_bytes(void)
-{
- uint64_t dbuf_cache_target = dbuf_cache_target_bytes();
- return (dbuf_cache_target +
- (dbuf_cache_target * dbuf_cache_hiwater_pct) / 100);
-}
-
-static inline uint64_t
-dbuf_cache_lowater_bytes(void)
-{
- uint64_t dbuf_cache_target = dbuf_cache_target_bytes();
- return (dbuf_cache_target -
- (dbuf_cache_target * dbuf_cache_lowater_pct) / 100);
-}
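/*
 * A standalone sketch (not part of this diff) of the target/low/high
 * water computations above, evaluated for one plausible configuration.
 * The ARC size and tunable values are illustrative, not measured
 * defaults.
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t arc_max = 4ULL << 30;		/* pretend 4 GiB ARC */
	uint64_t cache_max = UINT64_MAX;	/* unset: let the shift decide */
	int shift = 5;				/* dbuf_cache_shift */
	uint64_t hi_pct = 10, lo_pct = 10;

	uint64_t target = arc_max >> shift;
	if (cache_max < target)
		target = cache_max;		/* the MIN(), as above */
	uint64_t hi = target + (target * hi_pct) / 100;
	uint64_t lo = target - (target * lo_pct) / 100;

	printf("target %ju MiB, low %ju MiB, high %ju MiB\n",
	    (uintmax_t)(target >> 20), (uintmax_t)(lo >> 20),
	    (uintmax_t)(hi >> 20));
	return (0);
}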
-
-static inline boolean_t
-dbuf_cache_above_lowater(void)
-{
- return (zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) >
- dbuf_cache_lowater_bytes());
-}
-
-/*
- * Evict the oldest eligible dbuf from the dbuf cache.
- */
-static void
-dbuf_evict_one(void)
-{
- int idx = multilist_get_random_index(dbuf_caches[DB_DBUF_CACHE].cache);
- multilist_sublist_t *mls = multilist_sublist_lock(
- dbuf_caches[DB_DBUF_CACHE].cache, idx);
-
- ASSERT(!MUTEX_HELD(&dbuf_evict_lock));
-
- dmu_buf_impl_t *db = multilist_sublist_tail(mls);
- while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) {
- db = multilist_sublist_prev(mls, db);
- }
-
- DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t *, db,
- multilist_sublist_t *, mls);
-
- if (db != NULL) {
- multilist_sublist_remove(mls, db);
- multilist_sublist_unlock(mls);
- (void) zfs_refcount_remove_many(
- &dbuf_caches[DB_DBUF_CACHE].size,
- db->db.db_size, db);
- DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]);
- DBUF_STAT_BUMPDOWN(cache_count);
- DBUF_STAT_DECR(cache_levels_bytes[db->db_level],
- db->db.db_size);
- ASSERT3U(db->db_caching_status, ==, DB_DBUF_CACHE);
- db->db_caching_status = DB_NO_CACHE;
- dbuf_destroy(db);
- DBUF_STAT_BUMP(cache_total_evicts);
- } else {
- multilist_sublist_unlock(mls);
- }
-}
-
-/*
- * The dbuf evict thread is responsible for aging out dbufs from the
- * cache. Once the cache has reached its maximum size, dbufs are removed
- * and destroyed. The eviction thread will continue running until the size
- * of the dbuf cache is at or below the maximum size. Once the dbuf is aged
- * out of the cache it is destroyed and becomes eligible for arc eviction.
- */
-/* ARGSUSED */
-static void
-dbuf_evict_thread(void *unused __unused)
-{
- callb_cpr_t cpr;
-
- CALLB_CPR_INIT(&cpr, &dbuf_evict_lock, callb_generic_cpr, FTAG);
-
- mutex_enter(&dbuf_evict_lock);
- while (!dbuf_evict_thread_exit) {
- while (!dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) {
- CALLB_CPR_SAFE_BEGIN(&cpr);
- (void) cv_timedwait_hires(&dbuf_evict_cv,
- &dbuf_evict_lock, SEC2NSEC(1), MSEC2NSEC(1), 0);
- CALLB_CPR_SAFE_END(&cpr, &dbuf_evict_lock);
-#ifdef __FreeBSD__
- if (dbuf_ksp != NULL)
- dbuf_ksp->ks_update(dbuf_ksp, KSTAT_READ);
-#endif
- }
- mutex_exit(&dbuf_evict_lock);
-
- /*
- * Keep evicting as long as we're above the low water mark
- * for the cache. We do this without holding the locks to
- * minimize lock contention.
- */
- while (dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) {
- dbuf_evict_one();
- }
-
- mutex_enter(&dbuf_evict_lock);
- }
-
- dbuf_evict_thread_exit = B_FALSE;
- cv_broadcast(&dbuf_evict_cv);
- CALLB_CPR_EXIT(&cpr); /* drops dbuf_evict_lock */
- thread_exit();
-}
-
-/*
- * Wake up the dbuf eviction thread if the dbuf cache is at its max size.
- * If the dbuf cache is at its high water mark, then evict a dbuf from the
- * dbuf cache using the caller's context.
- */
-static void
-dbuf_evict_notify(uint64_t size)
-{
- /*
- * We check if we should evict without holding the dbuf_evict_lock,
- * because it's OK to occasionally make the wrong decision here,
- * and grabbing the lock results in massive lock contention.
- */
- if (size > dbuf_cache_max_bytes) {
- if (size > dbuf_cache_hiwater_bytes())
- dbuf_evict_one();
- cv_signal(&dbuf_evict_cv);
- }
-}
-
-static int
-dbuf_kstat_update(kstat_t *ksp, int rw)
-{
- dbuf_stats_t *ds = ksp->ks_data;
-
- if (rw == KSTAT_WRITE) {
- return (SET_ERROR(EACCES));
- } else {
- ds->metadata_cache_size_bytes.value.ui64 =
- zfs_refcount_count(&dbuf_caches[DB_DBUF_METADATA_CACHE].size);
- ds->cache_size_bytes.value.ui64 =
- zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size);
- ds->cache_target_bytes.value.ui64 = dbuf_cache_target_bytes();
- ds->cache_hiwater_bytes.value.ui64 = dbuf_cache_hiwater_bytes();
- ds->cache_lowater_bytes.value.ui64 = dbuf_cache_lowater_bytes();
- ds->hash_elements.value.ui64 = dbuf_hash_count;
- }
-
- return (0);
-}
-
-void
-dbuf_init(void)
-{
- uint64_t hsize = 1ULL << 16;
- dbuf_hash_table_t *h = &dbuf_hash_table;
- int i;
-
- /*
- * The hash table is big enough to fill all of physical memory
- * with an average 4K block size. The table will take up
- * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers).
- */
- while (hsize * 4096 < (uint64_t)physmem * PAGESIZE)
- hsize <<= 1;
-
-retry:
- h->hash_table_mask = hsize - 1;
- h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
- if (h->hash_table == NULL) {
- /* XXX - we should really return an error instead of assert */
- ASSERT(hsize > (1ULL << 10));
- hsize >>= 1;
- goto retry;
- }
-
- dbuf_kmem_cache = kmem_cache_create("dmu_buf_impl_t",
- sizeof (dmu_buf_impl_t),
- 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
-
- for (i = 0; i < DBUF_MUTEXES; i++)
- mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
-
- dbuf_stats_init(h);
- /*
- * Setup the parameters for the dbuf caches. We set the sizes of the
- * dbuf cache and the metadata cache to 1/32nd and 1/16th (default)
- * of the size of the ARC, respectively. If the values are set in
- * /etc/system and they're not greater than the size of the ARC, then
- * we honor those values.
- */
- if (dbuf_cache_max_bytes == 0 ||
- dbuf_cache_max_bytes >= arc_max_bytes()) {
- dbuf_cache_max_bytes = arc_max_bytes() >> dbuf_cache_shift;
- }
- if (dbuf_metadata_cache_max_bytes == 0 ||
- dbuf_metadata_cache_max_bytes >= arc_max_bytes()) {
- dbuf_metadata_cache_max_bytes =
- arc_max_bytes() >> dbuf_metadata_cache_shift;
- }
-
- /*
- * All entries are queued via taskq_dispatch_ent(), so min/maxalloc
- * configuration is not required.
- */
- dbu_evict_taskq = taskq_create("dbu_evict", 1, minclsyspri, 0, 0, 0);
-
- for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) {
- dbuf_caches[dcs].cache =
- multilist_create(sizeof (dmu_buf_impl_t),
- offsetof(dmu_buf_impl_t, db_cache_link),
- dbuf_cache_multilist_index_func);
- zfs_refcount_create(&dbuf_caches[dcs].size);
- }
-
- dbuf_evict_thread_exit = B_FALSE;
- mutex_init(&dbuf_evict_lock, NULL, MUTEX_DEFAULT, NULL);
- cv_init(&dbuf_evict_cv, NULL, CV_DEFAULT, NULL);
- dbuf_cache_evict_thread = thread_create(NULL, 0, dbuf_evict_thread,
- NULL, 0, &p0, TS_RUN, minclsyspri);
-
- dbuf_ksp = kstat_create("zfs", 0, "dbufstats", "misc",
- KSTAT_TYPE_NAMED, sizeof (dbuf_stats) / sizeof (kstat_named_t),
- KSTAT_FLAG_VIRTUAL);
- if (dbuf_ksp != NULL) {
- for (i = 0; i < DN_MAX_LEVELS; i++) {
- snprintf(dbuf_stats.cache_levels[i].name,
- KSTAT_STRLEN, "cache_level_%d", i);
- dbuf_stats.cache_levels[i].data_type =
- KSTAT_DATA_UINT64;
- snprintf(dbuf_stats.cache_levels_bytes[i].name,
- KSTAT_STRLEN, "cache_level_%d_bytes", i);
- dbuf_stats.cache_levels_bytes[i].data_type =
- KSTAT_DATA_UINT64;
- }
- dbuf_ksp->ks_data = &dbuf_stats;
- dbuf_ksp->ks_update = dbuf_kstat_update;
- kstat_install(dbuf_ksp);
- }
-}
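/*
 * A standalone sketch (not part of this diff) of the sizing loop at the
 * top of dbuf_init(): the table is doubled until it could index all of
 * physical memory at an average block size of 4K. The memory figure here
 * is made up for illustration.
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t physmem_bytes = 16ULL << 30;	/* pretend 16 GiB of RAM */
	uint64_t hsize = 1ULL << 16;

	while (hsize * 4096 < physmem_bytes)
		hsize <<= 1;

	printf("hash buckets: %ju (%ju MiB of 8-byte pointers)\n",
	    (uintmax_t)hsize, (uintmax_t)((hsize * 8) >> 20));
	return (0);
}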
-
-void
-dbuf_fini(void)
-{
- dbuf_hash_table_t *h = &dbuf_hash_table;
- int i;
-
- dbuf_stats_destroy();
-
- for (i = 0; i < DBUF_MUTEXES; i++)
- mutex_destroy(&h->hash_mutexes[i]);
- kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
- kmem_cache_destroy(dbuf_kmem_cache);
- taskq_destroy(dbu_evict_taskq);
-
- mutex_enter(&dbuf_evict_lock);
- dbuf_evict_thread_exit = B_TRUE;
- while (dbuf_evict_thread_exit) {
- cv_signal(&dbuf_evict_cv);
- cv_wait(&dbuf_evict_cv, &dbuf_evict_lock);
- }
- mutex_exit(&dbuf_evict_lock);
-
- mutex_destroy(&dbuf_evict_lock);
- cv_destroy(&dbuf_evict_cv);
-
- for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) {
- zfs_refcount_destroy(&dbuf_caches[dcs].size);
- multilist_destroy(dbuf_caches[dcs].cache);
- }
-
- if (dbuf_ksp != NULL) {
- kstat_delete(dbuf_ksp);
- dbuf_ksp = NULL;
- }
-}
-
-/*
- * Other stuff.
- */
-
-#ifdef ZFS_DEBUG
-static void
-dbuf_verify(dmu_buf_impl_t *db)
-{
- dnode_t *dn;
- dbuf_dirty_record_t *dr;
-
- ASSERT(MUTEX_HELD(&db->db_mtx));
-
- if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
- return;
-
- ASSERT(db->db_objset != NULL);
- DB_DNODE_ENTER(db);
- dn = DB_DNODE(db);
- if (dn == NULL) {
- ASSERT(db->db_parent == NULL);
- ASSERT(db->db_blkptr == NULL);
- } else {
- ASSERT3U(db->db.db_object, ==, dn->dn_object);
- ASSERT3P(db->db_objset, ==, dn->dn_objset);
- ASSERT3U(db->db_level, <, dn->dn_nlevels);
- ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
- db->db_blkid == DMU_SPILL_BLKID ||
- !avl_is_empty(&dn->dn_dbufs));
- }
- if (db->db_blkid == DMU_BONUS_BLKID) {
- ASSERT(dn != NULL);
- ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
- ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID);
- } else if (db->db_blkid == DMU_SPILL_BLKID) {
- ASSERT(dn != NULL);
- ASSERT0(db->db.db_offset);
- } else {
- ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
- }
-
- for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next)
- ASSERT(dr->dr_dbuf == db);
-
- for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next)
- ASSERT(dr->dr_dbuf == db);
-
- /*
- * We can't assert that db_size matches dn_datablksz because it
- * can be momentarily different when another thread is doing
- * dnode_set_blksz().
- */
- if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
- dr = db->db_data_pending;
- /*
- * It should only be modified in syncing context, so
- * make sure we only have one copy of the data.
- */
- ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
- }
-
- /* verify db->db_blkptr */
- if (db->db_blkptr) {
- if (db->db_parent == dn->dn_dbuf) {
- /* db is pointed to by the dnode */
- /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
- if (DMU_OBJECT_IS_SPECIAL(db->db.db_object))
- ASSERT(db->db_parent == NULL);
- else
- ASSERT(db->db_parent != NULL);
- if (db->db_blkid != DMU_SPILL_BLKID)
- ASSERT3P(db->db_blkptr, ==,
- &dn->dn_phys->dn_blkptr[db->db_blkid]);
- } else {
- /* db is pointed to by an indirect block */
- int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
- ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
- ASSERT3U(db->db_parent->db.db_object, ==,
- db->db.db_object);
- /*
- * dnode_grow_indblksz() can make this fail if we don't
- * have the struct_rwlock. XXX indblksz no longer
- * grows. safe to do this now?
- */
- if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
- ASSERT3P(db->db_blkptr, ==,
- ((blkptr_t *)db->db_parent->db.db_data +
- db->db_blkid % epb));
- }
- }
- }
- if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
- (db->db_buf == NULL || db->db_buf->b_data) &&
- db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
- db->db_state != DB_FILL && !dn->dn_free_txg) {
- /*
- * If the blkptr isn't set but they have nonzero data,
- * it had better be dirty, otherwise we'll lose that
- * data when we evict this buffer.
- *
- * There is an exception to this rule for indirect blocks; in
- * this case, if the indirect block is a hole, we fill in a few
- * fields on each of the child blocks (importantly, birth time)
- * to prevent hole birth times from being lost when you
- * partially fill in a hole.
- */
- if (db->db_dirtycnt == 0) {
- if (db->db_level == 0) {
- uint64_t *buf = db->db.db_data;
- int i;
-
- for (i = 0; i < db->db.db_size >> 3; i++) {
- ASSERT(buf[i] == 0);
- }
- } else {
- blkptr_t *bps = db->db.db_data;
- ASSERT3U(1 << DB_DNODE(db)->dn_indblkshift, ==,
- db->db.db_size);
- /*
- * We want to verify that all the blkptrs in the
- * indirect block are holes, but we may have
- * automatically set up a few fields for them.
- * We iterate through each blkptr and verify
- * they only have those fields set.
- */
- for (int i = 0;
- i < db->db.db_size / sizeof (blkptr_t);
- i++) {
- blkptr_t *bp = &bps[i];
- ASSERT(ZIO_CHECKSUM_IS_ZERO(
- &bp->blk_cksum));
- ASSERT(
- DVA_IS_EMPTY(&bp->blk_dva[0]) &&
- DVA_IS_EMPTY(&bp->blk_dva[1]) &&
- DVA_IS_EMPTY(&bp->blk_dva[2]));
- ASSERT0(bp->blk_fill);
- ASSERT0(bp->blk_pad[0]);
- ASSERT0(bp->blk_pad[1]);
- ASSERT(!BP_IS_EMBEDDED(bp));
- ASSERT(BP_IS_HOLE(bp));
- ASSERT0(bp->blk_phys_birth);
- }
- }
- }
- }
- DB_DNODE_EXIT(db);
-}
-#endif
-
-static void
-dbuf_clear_data(dmu_buf_impl_t *db)
-{
- ASSERT(MUTEX_HELD(&db->db_mtx));
- dbuf_evict_user(db);
- ASSERT3P(db->db_buf, ==, NULL);
- db->db.db_data = NULL;
- if (db->db_state != DB_NOFILL)
- db->db_state = DB_UNCACHED;
-}
-
-static void
-dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
-{
- ASSERT(MUTEX_HELD(&db->db_mtx));
- ASSERT(buf != NULL);
-
- db->db_buf = buf;
- ASSERT(buf->b_data != NULL);
- db->db.db_data = buf->b_data;
-}
-
-/*
- * Loan out an arc_buf for read. Return the loaned arc_buf.
- */
-arc_buf_t *
-dbuf_loan_arcbuf(dmu_buf_impl_t *db)
-{
- arc_buf_t *abuf;
-
- ASSERT(db->db_blkid != DMU_BONUS_BLKID);
- mutex_enter(&db->db_mtx);
- if (arc_released(db->db_buf) || zfs_refcount_count(&db->db_holds) > 1) {
- int blksz = db->db.db_size;
- spa_t *spa = db->db_objset->os_spa;
-
- mutex_exit(&db->db_mtx);
- abuf = arc_loan_buf(spa, B_FALSE, blksz);
- bcopy(db->db.db_data, abuf->b_data, blksz);
- } else {
- abuf = db->db_buf;
- arc_loan_inuse_buf(abuf, db);
- db->db_buf = NULL;
- dbuf_clear_data(db);
- mutex_exit(&db->db_mtx);
- }
- return (abuf);
-}
-
-/*
- * Calculate which level n block references the data at the level 0 offset
- * provided.
- */
-uint64_t
-dbuf_whichblock(dnode_t *dn, int64_t level, uint64_t offset)
-{
- if (dn->dn_datablkshift != 0 && dn->dn_indblkshift != 0) {
- /*
- * The level n blkid is equal to the level 0 blkid divided by
- * the number of level 0s in a level n block.
- *
- * The level 0 blkid is offset >> datablkshift =
- * offset / 2^datablkshift.
- *
- * The number of level 0s in a level n is the number of block
- * pointers in an indirect block, raised to the power of level.
- * This is 2^(indblkshift - SPA_BLKPTRSHIFT)^level =
- * 2^(level*(indblkshift - SPA_BLKPTRSHIFT)).
- *
- * Thus, the level n blkid is: offset /
- * ((2^datablkshift)*(2^(level*(indblkshift - SPA_BLKPTRSHIFT))))
- * = offset / 2^(datablkshift + level *
- * (indblkshift - SPA_BLKPTRSHIFT))
- * = offset >> (datablkshift + level *
- * (indblkshift - SPA_BLKPTRSHIFT))
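- *
- * Worked example (illustrative values, not from any particular
- * pool): with datablkshift = 17 (128K data blocks), indblkshift = 14
- * (16K indirect blocks) and SPA_BLKPTRSHIFT = 7, each indirect block
- * holds 2^(14 - 7) = 128 block pointers, so the level 2 blkid for
- * offset (1ULL << 35) is (1ULL << 35) >> (17 + 2 * 7) = 16.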
- */
- return (offset >> (dn->dn_datablkshift + level *
- (dn->dn_indblkshift - SPA_BLKPTRSHIFT)));
- } else {
- ASSERT3U(offset, <, dn->dn_datablksz);
- return (0);
- }
-}
-
-static void
-dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
- arc_buf_t *buf, void *vdb)
-{
- dmu_buf_impl_t *db = vdb;
-
- mutex_enter(&db->db_mtx);
- ASSERT3U(db->db_state, ==, DB_READ);
- /*
- * All reads are synchronous, so we must have a hold on the dbuf
- */
- ASSERT(zfs_refcount_count(&db->db_holds) > 0);
- ASSERT(db->db_buf == NULL);
- ASSERT(db->db.db_data == NULL);
- if (buf == NULL) {
- /* i/o error */
- ASSERT(zio == NULL || zio->io_error != 0);
- ASSERT(db->db_blkid != DMU_BONUS_BLKID);
- ASSERT3P(db->db_buf, ==, NULL);
- db->db_state = DB_UNCACHED;
- } else if (db->db_level == 0 && db->db_freed_in_flight) {
- /* freed in flight */
- ASSERT(zio == NULL || zio->io_error == 0);
- arc_release(buf, db);
- bzero(buf->b_data, db->db.db_size);
- arc_buf_freeze(buf);
- db->db_freed_in_flight = FALSE;
- dbuf_set_data(db, buf);
- db->db_state = DB_CACHED;
- } else {
- /* success */
- ASSERT(zio == NULL || zio->io_error == 0);
- dbuf_set_data(db, buf);
- db->db_state = DB_CACHED;
- }
- cv_broadcast(&db->db_changed);
- dbuf_rele_and_unlock(db, NULL, B_FALSE);
-}
-
-static void
-dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
-{
- dnode_t *dn;
- zbookmark_phys_t zb;
- arc_flags_t aflags = ARC_FLAG_NOWAIT;
-
- DB_DNODE_ENTER(db);
- dn = DB_DNODE(db);
- ASSERT(!zfs_refcount_is_zero(&db->db_holds));
- /* We need the struct_rwlock to prevent db_blkptr from changing. */
- ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
- ASSERT(MUTEX_HELD(&db->db_mtx));
- ASSERT(db->db_state == DB_UNCACHED);
- ASSERT(db->db_buf == NULL);
-
- if (db->db_blkid == DMU_BONUS_BLKID) {
- /*
- * The bonus length stored in the dnode may be less than
- * the maximum available space in the bonus buffer.
- */
- int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
- int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
-
- ASSERT3U(bonuslen, <=, db->db.db_size);
- db->db.db_data = zio_buf_alloc(max_bonuslen);
- arc_space_consume(max_bonuslen, ARC_SPACE_BONUS);
- if (bonuslen < max_bonuslen)
- bzero(db->db.db_data, max_bonuslen);
- if (bonuslen)
- bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
- DB_DNODE_EXIT(db);
- db->db_state = DB_CACHED;
- mutex_exit(&db->db_mtx);
- return;
- }
-
- /*
- * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
- * processes the delete record and clears the bp while we are waiting
- * for the dn_mtx (resulting in a "no" from block_freed).
- */
- if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) ||
- (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) ||
- BP_IS_HOLE(db->db_blkptr)))) {
- arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
-
- dbuf_set_data(db, arc_alloc_buf(db->db_objset->os_spa, db, type,
- db->db.db_size));
- bzero(db->db.db_data, db->db.db_size);
-
- if (db->db_blkptr != NULL && db->db_level > 0 &&
- BP_IS_HOLE(db->db_blkptr) &&
- db->db_blkptr->blk_birth != 0) {
- blkptr_t *bps = db->db.db_data;
- for (int i = 0; i < ((1 <<
- DB_DNODE(db)->dn_indblkshift) / sizeof (blkptr_t));
- i++) {
- blkptr_t *bp = &bps[i];
- ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
- 1 << dn->dn_indblkshift);
- BP_SET_LSIZE(bp,
- BP_GET_LEVEL(db->db_blkptr) == 1 ?
- dn->dn_datablksz :
- BP_GET_LSIZE(db->db_blkptr));
- BP_SET_TYPE(bp, BP_GET_TYPE(db->db_blkptr));
- BP_SET_LEVEL(bp,
- BP_GET_LEVEL(db->db_blkptr) - 1);
- BP_SET_BIRTH(bp, db->db_blkptr->blk_birth, 0);
- }
- }
- DB_DNODE_EXIT(db);
- db->db_state = DB_CACHED;
- mutex_exit(&db->db_mtx);
- return;
- }
-
- DB_DNODE_EXIT(db);
-
- db->db_state = DB_READ;
- mutex_exit(&db->db_mtx);
-
- if (DBUF_IS_L2CACHEABLE(db))
- aflags |= ARC_FLAG_L2CACHE;
-
- SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ?
- db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
- db->db.db_object, db->db_level, db->db_blkid);
-
- dbuf_add_ref(db, NULL);
-
- (void) arc_read(zio, db->db_objset->os_spa, db->db_blkptr,
- dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
- (flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
- &aflags, &zb);
-}
-
-/*
- * This is our just-in-time copy function. It makes a copy of buffers that
- * have been modified in a previous transaction group before we access them in
- * the current active group.
- *
- * This function is used in three places: when we are dirtying a buffer for the
- * first time in a txg, when we are freeing a range in a dnode that includes
- * this buffer, and when we are accessing a buffer which was received compressed
- * and later referenced in a WRITE_BYREF record.
- *
- * Note that when we are called from dbuf_free_range() we do not put a hold on
- * the buffer, we just traverse the active dbuf list for the dnode.
- */
-static void
-dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
-{
- dbuf_dirty_record_t *dr = db->db_last_dirty;
-
- ASSERT(MUTEX_HELD(&db->db_mtx));
- ASSERT(db->db.db_data != NULL);
- ASSERT(db->db_level == 0);
- ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);
-
- if (dr == NULL ||
- (dr->dt.dl.dr_data !=
- ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
- return;
-
- /*
- * If the last dirty record for this dbuf has not yet synced
- * and it's referencing the dbuf data, either:
- * reset the reference to point to a new copy,
- * or (if there are no active holders)
- * just null out the current db_data pointer.
- */
- ASSERT(dr->dr_txg >= txg - 2);
- if (db->db_blkid == DMU_BONUS_BLKID) {
- /* Note that the data bufs here are zio_bufs */
- dnode_t *dn = DB_DNODE(db);
- int bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
- dr->dt.dl.dr_data = zio_buf_alloc(bonuslen);
- arc_space_consume(bonuslen, ARC_SPACE_BONUS);
- bcopy(db->db.db_data, dr->dt.dl.dr_data, bonuslen);
- } else if (zfs_refcount_count(&db->db_holds) > db->db_dirtycnt) {
- int size = arc_buf_size(db->db_buf);
- arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
- spa_t *spa = db->db_objset->os_spa;
- enum zio_compress compress_type =
- arc_get_compression(db->db_buf);
-
- if (compress_type == ZIO_COMPRESS_OFF) {
- dr->dt.dl.dr_data = arc_alloc_buf(spa, db, type, size);
- } else {
- ASSERT3U(type, ==, ARC_BUFC_DATA);
- dr->dt.dl.dr_data = arc_alloc_compressed_buf(spa, db,
- size, arc_buf_lsize(db->db_buf), compress_type);
- }
- bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
- } else {
- db->db_buf = NULL;
- dbuf_clear_data(db);
- }
-}
-
-int
-dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
-{
- int err = 0;
- boolean_t prefetch;
- dnode_t *dn;
-
- /*
- * We don't have to hold the mutex to check db_state because it
- * can't be freed while we have a hold on the buffer.
- */
- ASSERT(!zfs_refcount_is_zero(&db->db_holds));
-
- if (db->db_state == DB_NOFILL)
- return (SET_ERROR(EIO));
-
- DB_DNODE_ENTER(db);
- dn = DB_DNODE(db);
- if ((flags & DB_RF_HAVESTRUCT) == 0)
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
-
- prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
- (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
- DBUF_IS_CACHEABLE(db);
-
- mutex_enter(&db->db_mtx);
- if (db->db_state == DB_CACHED) {
- /*
- * If the arc buf is compressed, we need to decompress it to
- * read the data. This could happen during the "zfs receive" of
- * a stream which is compressed and deduplicated.
- */
- if (db->db_buf != NULL &&
- arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF) {
- dbuf_fix_old_data(db,
- spa_syncing_txg(dmu_objset_spa(db->db_objset)));
- err = arc_decompress(db->db_buf);
- dbuf_set_data(db, db->db_buf);
- }
- mutex_exit(&db->db_mtx);
- if (prefetch)
- dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE);
- if ((flags & DB_RF_HAVESTRUCT) == 0)
- rw_exit(&dn->dn_struct_rwlock);
- DB_DNODE_EXIT(db);
- DBUF_STAT_BUMP(hash_hits);
- } else if (db->db_state == DB_UNCACHED) {
- spa_t *spa = dn->dn_objset->os_spa;
- boolean_t need_wait = B_FALSE;
-
- if (zio == NULL &&
- db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) {
- zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
- need_wait = B_TRUE;
- }
- dbuf_read_impl(db, zio, flags);
-
- /* dbuf_read_impl has dropped db_mtx for us */
-
- if (prefetch)
- dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE);
-
- if ((flags & DB_RF_HAVESTRUCT) == 0)
- rw_exit(&dn->dn_struct_rwlock);
- DB_DNODE_EXIT(db);
- DBUF_STAT_BUMP(hash_misses);
-
- if (need_wait)
- err = zio_wait(zio);
- } else {
- /*
- * Another reader came in while the dbuf was in flight
- * between UNCACHED and CACHED. Either a writer will finish
- * writing the buffer (sending the dbuf to CACHED) or the
- * first reader's request will reach the read_done callback
- * and send the dbuf to CACHED. Otherwise, a failure
- * occurred and the dbuf went to UNCACHED.
- */
- mutex_exit(&db->db_mtx);
- if (prefetch)
- dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE);
- if ((flags & DB_RF_HAVESTRUCT) == 0)
- rw_exit(&dn->dn_struct_rwlock);
- DB_DNODE_EXIT(db);
- DBUF_STAT_BUMP(hash_misses);
-
- /*
- * Wait for the buffer to leave DB_READ/DB_FILL, unless the
- * caller asked us not to (DB_RF_NEVERWAIT).
- */
- mutex_enter(&db->db_mtx);
- if ((flags & DB_RF_NEVERWAIT) == 0) {
- while (db->db_state == DB_READ ||
- db->db_state == DB_FILL) {
- ASSERT(db->db_state == DB_READ ||
- (flags & DB_RF_HAVESTRUCT) == 0);
- DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *,
- db, zio_t *, zio);
- cv_wait(&db->db_changed, &db->db_mtx);
- }
- if (db->db_state == DB_UNCACHED)
- err = SET_ERROR(EIO);
- }
- mutex_exit(&db->db_mtx);
- }
-
- return (err);
-}
-
-static void
-dbuf_noread(dmu_buf_impl_t *db)
-{
- ASSERT(!zfs_refcount_is_zero(&db->db_holds));
- ASSERT(db->db_blkid != DMU_BONUS_BLKID);
- mutex_enter(&db->db_mtx);
- while (db->db_state == DB_READ || db->db_state == DB_FILL)
- cv_wait(&db->db_changed, &db->db_mtx);
- if (db->db_state == DB_UNCACHED) {
- arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
- spa_t *spa = db->db_objset->os_spa;
-
- ASSERT(db->db_buf == NULL);
- ASSERT(db->db.db_data == NULL);
- dbuf_set_data(db, arc_alloc_buf(spa, db, type, db->db.db_size));
- db->db_state = DB_FILL;
- } else if (db->db_state == DB_NOFILL) {
- dbuf_clear_data(db);
- } else {
- ASSERT3U(db->db_state, ==, DB_CACHED);
- }
- mutex_exit(&db->db_mtx);
-}
-
-void
-dbuf_unoverride(dbuf_dirty_record_t *dr)
-{
- dmu_buf_impl_t *db = dr->dr_dbuf;
- blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
- uint64_t txg = dr->dr_txg;
-
- ASSERT(MUTEX_HELD(&db->db_mtx));
- /*
- * This assert is valid because dmu_sync() expects to be called by
- * a zilog's get_data while holding a range lock. This call only
- * comes from dbuf_dirty() callers who must also hold a range lock.
- */
- ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
- ASSERT(db->db_level == 0);
-
- if (db->db_blkid == DMU_BONUS_BLKID ||
- dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
- return;
-
- ASSERT(db->db_data_pending != dr);
-
- /* free this block */
- if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite)
- zio_free(db->db_objset->os_spa, txg, bp);
-
- dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
- dr->dt.dl.dr_nopwrite = B_FALSE;
-
- /*
- * Release the already-written buffer, so we leave it in
- * a consistent dirty state. Note that all callers are
- * modifying the buffer, so they will immediately do
- * another (redundant) arc_release(). Therefore, leave
- * the buf thawed to save the effort of freezing &
- * immediately re-thawing it.
- */
- arc_release(dr->dt.dl.dr_data, db);
-}
-
-/*
- * Evict (if it's unreferenced) or clear (if it's referenced) any level-0
- * data blocks in the free range, so that any future readers will find
- * empty blocks.
- */
-void
-dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
- dmu_tx_t *tx)
-{
- dmu_buf_impl_t db_search;
- dmu_buf_impl_t *db, *db_next;
- uint64_t txg = tx->tx_txg;
- avl_index_t where;
-
- if (end_blkid > dn->dn_maxblkid &&
- !(start_blkid == DMU_SPILL_BLKID || end_blkid == DMU_SPILL_BLKID))
- end_blkid = dn->dn_maxblkid;
- dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid);
-
- db_search.db_level = 0;
- db_search.db_blkid = start_blkid;
- db_search.db_state = DB_SEARCH;
-
- mutex_enter(&dn->dn_dbufs_mtx);
- db = avl_find(&dn->dn_dbufs, &db_search, &where);
- ASSERT3P(db, ==, NULL);
-
- db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
-
- for (; db != NULL; db = db_next) {
- db_next = AVL_NEXT(&dn->dn_dbufs, db);
- ASSERT(db->db_blkid != DMU_BONUS_BLKID);
-
- if (db->db_level != 0 || db->db_blkid > end_blkid) {
- break;
- }
- ASSERT3U(db->db_blkid, >=, start_blkid);
-
- /* found a level 0 buffer in the range */
- mutex_enter(&db->db_mtx);
- if (dbuf_undirty(db, tx)) {
- /* mutex has been dropped and dbuf destroyed */
- continue;
- }
-
- if (db->db_state == DB_UNCACHED ||
- db->db_state == DB_NOFILL ||
- db->db_state == DB_EVICTING) {
- ASSERT(db->db.db_data == NULL);
- mutex_exit(&db->db_mtx);
- continue;
- }
- if (db->db_state == DB_READ || db->db_state == DB_FILL) {
- /* will be handled in dbuf_read_done or dbuf_rele */
- db->db_freed_in_flight = TRUE;
- mutex_exit(&db->db_mtx);
- continue;
- }
- if (zfs_refcount_count(&db->db_holds) == 0) {
- ASSERT(db->db_buf);
- dbuf_destroy(db);
- continue;
- }
- /* The dbuf is referenced */
-
- if (db->db_last_dirty != NULL) {
- dbuf_dirty_record_t *dr = db->db_last_dirty;
-
- if (dr->dr_txg == txg) {
- /*
- * This buffer is "in-use"; re-adjust the file
- * size to reflect that this buffer may
- * contain new data when we sync.
- */
- if (db->db_blkid != DMU_SPILL_BLKID &&
- db->db_blkid > dn->dn_maxblkid)
- dn->dn_maxblkid = db->db_blkid;
- dbuf_unoverride(dr);
- } else {
- /*
- * This dbuf is not dirty in the open context.
- * Either uncache it (if it's not referenced in
- * the open context) or reset its contents to
- * empty.
- */
- dbuf_fix_old_data(db, txg);
- }
- }
- /* clear the contents if it's cached */
- if (db->db_state == DB_CACHED) {
- ASSERT(db->db.db_data != NULL);
- arc_release(db->db_buf, db);
- bzero(db->db.db_data, db->db.db_size);
- arc_buf_freeze(db->db_buf);
- }
-
- mutex_exit(&db->db_mtx);
- }
- mutex_exit(&dn->dn_dbufs_mtx);
-}
-
-void
-dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
-{
- arc_buf_t *buf, *obuf;
- int osize = db->db.db_size;
- arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
- dnode_t *dn;
-
- ASSERT(db->db_blkid != DMU_BONUS_BLKID);
-
- DB_DNODE_ENTER(db);
- dn = DB_DNODE(db);
-
- /* XXX does *this* func really need the lock? */
- ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
-
- /*
- * This call to dmu_buf_will_dirty() with the dn_struct_rwlock held
- * is OK, because there can be no other references to the db
- * when we are changing its size, so no concurrent DB_FILL can
- * be happening.
- */
- /*
- * XXX we should be doing a dbuf_read, checking the return
- * value and returning that up to our callers
- */
- dmu_buf_will_dirty(&db->db, tx);
-
- /* create the data buffer for the new block */
- buf = arc_alloc_buf(dn->dn_objset->os_spa, db, type, size);
-
- /* copy old block data to the new block */
- obuf = db->db_buf;
- bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
- /* zero the remainder */
- if (size > osize)
- bzero((uint8_t *)buf->b_data + osize, size - osize);
-
- mutex_enter(&db->db_mtx);
- dbuf_set_data(db, buf);
- arc_buf_destroy(obuf, db);
- db->db.db_size = size;
-
- if (db->db_level == 0) {
- ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
- db->db_last_dirty->dt.dl.dr_data = buf;
- }
- mutex_exit(&db->db_mtx);
-
- dmu_objset_willuse_space(dn->dn_objset, size - osize, tx);
- DB_DNODE_EXIT(db);
-}
-
-void
-dbuf_release_bp(dmu_buf_impl_t *db)
-{
- objset_t *os = db->db_objset;
-
- ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
- ASSERT(arc_released(os->os_phys_buf) ||
- list_link_active(&os->os_dsl_dataset->ds_synced_link));
- ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
-
- (void) arc_release(db->db_buf, db);
-}
-
-/*
- * We already have a dirty record for this TXG, and we are being
- * dirtied again.
- */
-static void
-dbuf_redirty(dbuf_dirty_record_t *dr)
-{
- dmu_buf_impl_t *db = dr->dr_dbuf;
-
- ASSERT(MUTEX_HELD(&db->db_mtx));
-
- if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
- /*
- * If this buffer has already been written out,
- * we now need to reset its state.
- */
- dbuf_unoverride(dr);
- if (db->db.db_object != DMU_META_DNODE_OBJECT &&
- db->db_state != DB_NOFILL) {
- /* Already released on initial dirty, so just thaw. */
- ASSERT(arc_released(db->db_buf));
- arc_buf_thaw(db->db_buf);
- }
- }
-}
-
-dbuf_dirty_record_t *
-dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
-{
- dnode_t *dn;
- objset_t *os;
- dbuf_dirty_record_t **drp, *dr;
- int drop_struct_lock = FALSE;
- int txgoff = tx->tx_txg & TXG_MASK;
-
- ASSERT(tx->tx_txg != 0);
- ASSERT(!zfs_refcount_is_zero(&db->db_holds));
- DMU_TX_DIRTY_BUF(tx, db);
-
- DB_DNODE_ENTER(db);
- dn = DB_DNODE(db);
- /*
- * Shouldn't dirty a regular buffer in syncing context. Private
- * objects may be dirtied in syncing context, but only if they
- * were already pre-dirtied in open context.
- */
-#ifdef DEBUG
- if (dn->dn_objset->os_dsl_dataset != NULL) {
- rrw_enter(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock,
- RW_READER, FTAG);
- }
- ASSERT(!dmu_tx_is_syncing(tx) ||
- BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
- DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
- dn->dn_objset->os_dsl_dataset == NULL);
- if (dn->dn_objset->os_dsl_dataset != NULL)
- rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, FTAG);
-#endif
- /*
- * We make this assert for private objects as well, but after we
- * check if we're already dirty. They are allowed to re-dirty
- * in syncing context.
- */
- ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
- dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
- (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
-
- mutex_enter(&db->db_mtx);
- /*
- * XXX make this true for indirects too? The problem is that
- * transactions created with dmu_tx_create_assigned() from
- * syncing context don't bother holding ahead.
- */
- ASSERT(db->db_level != 0 ||
- db->db_state == DB_CACHED || db->db_state == DB_FILL ||
- db->db_state == DB_NOFILL);
-
- mutex_enter(&dn->dn_mtx);
- /*
- * Don't set dirtyctx to SYNC if we're just modifying this as we
- * initialize the objset.
- */
- if (dn->dn_dirtyctx == DN_UNDIRTIED) {
- if (dn->dn_objset->os_dsl_dataset != NULL) {
- rrw_enter(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock,
- RW_READER, FTAG);
- }
- if (!BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
- dn->dn_dirtyctx = (dmu_tx_is_syncing(tx) ?
- DN_DIRTY_SYNC : DN_DIRTY_OPEN);
- ASSERT(dn->dn_dirtyctx_firstset == NULL);
- dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
- }
- if (dn->dn_objset->os_dsl_dataset != NULL) {
- rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock,
- FTAG);
- }
- }
-
- if (tx->tx_txg > dn->dn_dirty_txg)
- dn->dn_dirty_txg = tx->tx_txg;
- mutex_exit(&dn->dn_mtx);
-
- if (db->db_blkid == DMU_SPILL_BLKID)
- dn->dn_have_spill = B_TRUE;
-
- /*
- * If this buffer is already dirty, we're done.
- */
- drp = &db->db_last_dirty;
- ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
- db->db.db_object == DMU_META_DNODE_OBJECT);
- while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
- drp = &dr->dr_next;
- if (dr && dr->dr_txg == tx->tx_txg) {
- DB_DNODE_EXIT(db);
-
- dbuf_redirty(dr);
- mutex_exit(&db->db_mtx);
- return (dr);
- }
-
- /*
- * Only valid if not already dirty.
- */
- ASSERT(dn->dn_object == 0 ||
- dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
- (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
-
- ASSERT3U(dn->dn_nlevels, >, db->db_level);
-
- /*
- * We should only be dirtying in syncing context if it's the
- * mos or we're initializing the os or it's a special object.
- * However, we are allowed to dirty in syncing context provided
- * we already dirtied it in open context. Hence we must make
- * this assertion only if we're not already dirty.
- */
- os = dn->dn_objset;
- VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(os->os_spa));
-#ifdef DEBUG
- if (dn->dn_objset->os_dsl_dataset != NULL)
- rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_READER, FTAG);
- ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
- os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
- if (dn->dn_objset->os_dsl_dataset != NULL)
- rrw_exit(&os->os_dsl_dataset->ds_bp_rwlock, FTAG);
-#endif
- ASSERT(db->db.db_size != 0);
-
- dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
-
- if (db->db_blkid != DMU_BONUS_BLKID) {
- dmu_objset_willuse_space(os, db->db.db_size, tx);
- }
-
- /*
- * If this buffer is dirty in an old transaction group we need
- * to make a copy of it so that the changes we make in this
- * transaction group won't leak out when we sync the older txg.
- */
- dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
- list_link_init(&dr->dr_dirty_node);
- if (db->db_level == 0) {
- void *data_old = db->db_buf;
-
- if (db->db_state != DB_NOFILL) {
- if (db->db_blkid == DMU_BONUS_BLKID) {
- dbuf_fix_old_data(db, tx->tx_txg);
- data_old = db->db.db_data;
- } else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
- /*
- * Release the data buffer from the cache so
- * that we can modify it without impacting
- * possible other users of this cached data
- * block. Note that indirect blocks and
- * private objects are not released until the
- * syncing state (since they are only modified
- * then).
- */
- arc_release(db->db_buf, db);
- dbuf_fix_old_data(db, tx->tx_txg);
- data_old = db->db_buf;
- }
- ASSERT(data_old != NULL);
- }
- dr->dt.dl.dr_data = data_old;
- } else {
- mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
- list_create(&dr->dt.di.dr_children,
- sizeof (dbuf_dirty_record_t),
- offsetof(dbuf_dirty_record_t, dr_dirty_node));
- }
- if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL)
- dr->dr_accounted = db->db.db_size;
- dr->dr_dbuf = db;
- dr->dr_txg = tx->tx_txg;
- dr->dr_next = *drp;
- *drp = dr;
-
- /*
- * We could have been freed_in_flight between the dbuf_noread
- * and dbuf_dirty. We win, as though the dbuf_noread() had
- * happened after the free.
- */
- if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
- db->db_blkid != DMU_SPILL_BLKID) {
- mutex_enter(&dn->dn_mtx);
- if (dn->dn_free_ranges[txgoff] != NULL) {
- range_tree_clear(dn->dn_free_ranges[txgoff],
- db->db_blkid, 1);
- }
- mutex_exit(&dn->dn_mtx);
- db->db_freed_in_flight = FALSE;
- }
-
- /*
- * This buffer is now part of this txg
- */
- dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
- db->db_dirtycnt += 1;
- ASSERT3U(db->db_dirtycnt, <=, 3);
-
- mutex_exit(&db->db_mtx);
-
- if (db->db_blkid == DMU_BONUS_BLKID ||
- db->db_blkid == DMU_SPILL_BLKID) {
- mutex_enter(&dn->dn_mtx);
- ASSERT(!list_link_active(&dr->dr_dirty_node));
- list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
- mutex_exit(&dn->dn_mtx);
- dnode_setdirty(dn, tx);
- DB_DNODE_EXIT(db);
- return (dr);
- }
-
- /*
- * The dn_struct_rwlock prevents db_blkptr from changing
- * due to a write from syncing context completing
- * while we are running, so we want to acquire it before
- * looking at db_blkptr.
- */
- if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
- drop_struct_lock = TRUE;
- }
-
- /*
- * We need to hold the dn_struct_rwlock to make this assertion,
- * because it protects dn_phys / dn_next_nlevels from changing.
- */
- ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
- dn->dn_phys->dn_nlevels > db->db_level ||
- dn->dn_next_nlevels[txgoff] > db->db_level ||
- dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
- dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);
-
- /*
- * If we are overwriting a dedup BP, then unless it is snapshotted,
- * when we get to syncing context we will need to decrement its
- * refcount in the DDT. Prefetch the relevant DDT block so that
- * syncing context won't have to wait for the i/o.
- */
- ddt_prefetch(os->os_spa, db->db_blkptr);
-
- if (db->db_level == 0) {
- dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock);
- ASSERT(dn->dn_maxblkid >= db->db_blkid);
- }
-
- if (db->db_level+1 < dn->dn_nlevels) {
- dmu_buf_impl_t *parent = db->db_parent;
- dbuf_dirty_record_t *di;
- int parent_held = FALSE;
-
- if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
- int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
-
- parent = dbuf_hold_level(dn, db->db_level+1,
- db->db_blkid >> epbs, FTAG);
- ASSERT(parent != NULL);
- parent_held = TRUE;
- }
- if (drop_struct_lock)
- rw_exit(&dn->dn_struct_rwlock);
- ASSERT3U(db->db_level+1, ==, parent->db_level);
- di = dbuf_dirty(parent, tx);
- if (parent_held)
- dbuf_rele(parent, FTAG);
-
- mutex_enter(&db->db_mtx);
- /*
- * Since we've dropped the mutex, it's possible that
- * dbuf_undirty() might have changed this out from under us.
- */
- if (db->db_last_dirty == dr ||
- dn->dn_object == DMU_META_DNODE_OBJECT) {
- mutex_enter(&di->dt.di.dr_mtx);
- ASSERT3U(di->dr_txg, ==, tx->tx_txg);
- ASSERT(!list_link_active(&dr->dr_dirty_node));
- list_insert_tail(&di->dt.di.dr_children, dr);
- mutex_exit(&di->dt.di.dr_mtx);
- dr->dr_parent = di;
- }
- mutex_exit(&db->db_mtx);
- } else {
- ASSERT(db->db_level+1 == dn->dn_nlevels);
- ASSERT(db->db_blkid < dn->dn_nblkptr);
- ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
- mutex_enter(&dn->dn_mtx);
- ASSERT(!list_link_active(&dr->dr_dirty_node));
- list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
- mutex_exit(&dn->dn_mtx);
- if (drop_struct_lock)
- rw_exit(&dn->dn_struct_rwlock);
- }
-
- dnode_setdirty(dn, tx);
- DB_DNODE_EXIT(db);
- return (dr);
-}
-
-/*
- * Undirty a buffer in the transaction group referenced by the given
- * transaction. Return whether this evicted the dbuf.
- */
-static boolean_t
-dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
-{
- dnode_t *dn;
- uint64_t txg = tx->tx_txg;
- dbuf_dirty_record_t *dr, **drp;
-
- ASSERT(txg != 0);
-
- /*
- * Due to our use of dn_nlevels below, this can only be called
- * in open context, unless we are operating on the MOS.
- * From syncing context, dn_nlevels may be different from the
- * dn_nlevels used when dbuf was dirtied.
- */
- ASSERT(db->db_objset ==
- dmu_objset_pool(db->db_objset)->dp_meta_objset ||
- txg != spa_syncing_txg(dmu_objset_spa(db->db_objset)));
- ASSERT(db->db_blkid != DMU_BONUS_BLKID);
- ASSERT0(db->db_level);
- ASSERT(MUTEX_HELD(&db->db_mtx));
-
- /*
- * If this buffer is not dirty, we're done.
- */
- for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
- if (dr->dr_txg <= txg)
- break;
- if (dr == NULL || dr->dr_txg < txg)
- return (B_FALSE);
- ASSERT(dr->dr_txg == txg);
- ASSERT(dr->dr_dbuf == db);
-
- DB_DNODE_ENTER(db);
- dn = DB_DNODE(db);
-
- dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
-
- ASSERT(db->db.db_size != 0);
-
- dsl_pool_undirty_space(dmu_objset_pool(dn->dn_objset),
- dr->dr_accounted, txg);
-
- *drp = dr->dr_next;
-
- /*
- * Note that there are three places in dbuf_dirty()
- * where this dirty record may be put on a list.
- * Make sure to do a list_remove corresponding to
- * every one of those list_insert calls.
- */
- if (dr->dr_parent) {
- mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
- list_remove(&dr->dr_parent->dt.di.dr_children, dr);
- mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
- } else if (db->db_blkid == DMU_SPILL_BLKID ||
- db->db_level + 1 == dn->dn_nlevels) {
- ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
- mutex_enter(&dn->dn_mtx);
- list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
- mutex_exit(&dn->dn_mtx);
- }
- DB_DNODE_EXIT(db);
-
- if (db->db_state != DB_NOFILL) {
- dbuf_unoverride(dr);
-
- ASSERT(db->db_buf != NULL);
- ASSERT(dr->dt.dl.dr_data != NULL);
- if (dr->dt.dl.dr_data != db->db_buf)
- arc_buf_destroy(dr->dt.dl.dr_data, db);
- }
-
- kmem_free(dr, sizeof (dbuf_dirty_record_t));
-
- ASSERT(db->db_dirtycnt > 0);
- db->db_dirtycnt -= 1;
-
- if (zfs_refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
- ASSERT(db->db_state == DB_NOFILL || arc_released(db->db_buf));
- dbuf_destroy(db);
- return (B_TRUE);
- }
-
- return (B_FALSE);
-}
-
-void
-dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
-{
- dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
- int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;
-
- ASSERT(tx->tx_txg != 0);
- ASSERT(!zfs_refcount_is_zero(&db->db_holds));
-
- /*
- * Quick check for dirtiness. For already dirty blocks, this
- * reduces the runtime of this function by more than 90% and improves
- * overall performance by 50% for some workloads (e.g. file deletion
- * with indirect blocks cached).
- */
- mutex_enter(&db->db_mtx);
- dbuf_dirty_record_t *dr;
- for (dr = db->db_last_dirty;
- dr != NULL && dr->dr_txg >= tx->tx_txg; dr = dr->dr_next) {
- /*
- * It's possible that it is already dirty but not cached,
- * because there are some calls to dbuf_dirty() that don't
- * go through dmu_buf_will_dirty().
- */
- if (dr->dr_txg == tx->tx_txg && db->db_state == DB_CACHED) {
- /* This dbuf is already dirty and cached. */
- dbuf_redirty(dr);
- mutex_exit(&db->db_mtx);
- return;
- }
- }
- mutex_exit(&db->db_mtx);
-
- DB_DNODE_ENTER(db);
- if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
- rf |= DB_RF_HAVESTRUCT;
- DB_DNODE_EXIT(db);
- (void) dbuf_read(db, NULL, rf);
- (void) dbuf_dirty(db, tx);
-}
-
-void
-dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
-{
- dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
-
- db->db_state = DB_NOFILL;
-
- dmu_buf_will_fill(db_fake, tx);
-}
-
-void
-dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
-{
- dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
-
- ASSERT(db->db_blkid != DMU_BONUS_BLKID);
- ASSERT(tx->tx_txg != 0);
- ASSERT(db->db_level == 0);
- ASSERT(!zfs_refcount_is_zero(&db->db_holds));
-
- ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
- dmu_tx_private_ok(tx));
-
- dbuf_noread(db);
- (void) dbuf_dirty(db, tx);
-}
-
-#pragma weak dmu_buf_fill_done = dbuf_fill_done
-/* ARGSUSED */
-void
-dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
-{
- mutex_enter(&db->db_mtx);
- DBUF_VERIFY(db);
-
- if (db->db_state == DB_FILL) {
- if (db->db_level == 0 && db->db_freed_in_flight) {
- ASSERT(db->db_blkid != DMU_BONUS_BLKID);
- /* we were freed while filling */
- /* XXX dbuf_undirty? */
- bzero(db->db.db_data, db->db.db_size);
- db->db_freed_in_flight = FALSE;
- }
- db->db_state = DB_CACHED;
- cv_broadcast(&db->db_changed);
- }
- mutex_exit(&db->db_mtx);
-}
-
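- /*
- * Write caller-supplied data into this dbuf as an embedded block
- * pointer (a sketch of the contract, as implemented below): the dbuf
- * is marked NOFILL, and the dirty record's override BP is encoded
- * with the payload and marked DR_OVERRIDDEN, so syncing context does
- * not write a separate data block for this buffer.
- */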
-void
-dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
- bp_embedded_type_t etype, enum zio_compress comp,
- int uncompressed_size, int compressed_size, int byteorder,
- dmu_tx_t *tx)
-{
- dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
- struct dirty_leaf *dl;
- dmu_object_type_t type;
-
- if (etype == BP_EMBEDDED_TYPE_DATA) {
- ASSERT(spa_feature_is_active(dmu_objset_spa(db->db_objset),
- SPA_FEATURE_EMBEDDED_DATA));
- }
-
- DB_DNODE_ENTER(db);
- type = DB_DNODE(db)->dn_type;
- DB_DNODE_EXIT(db);
-
- ASSERT0(db->db_level);
- ASSERT(db->db_blkid != DMU_BONUS_BLKID);
-
- dmu_buf_will_not_fill(dbuf, tx);
-
- ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
- dl = &db->db_last_dirty->dt.dl;
- encode_embedded_bp_compressed(&dl->dr_overridden_by,
- data, comp, uncompressed_size, compressed_size);
- BPE_SET_ETYPE(&dl->dr_overridden_by, etype);
- BP_SET_TYPE(&dl->dr_overridden_by, type);
- BP_SET_LEVEL(&dl->dr_overridden_by, 0);
- BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder);
-
- dl->dr_override_state = DR_OVERRIDDEN;
- dl->dr_overridden_by.blk_birth = db->db_last_dirty->dr_txg;
-}
-
-/*
- * Directly assign a provided arc buf to a given dbuf if it's not referenced
- * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf.
- */
-void
-dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
-{
- ASSERT(!zfs_refcount_is_zero(&db->db_holds));
- ASSERT(db->db_blkid != DMU_BONUS_BLKID);
- ASSERT(db->db_level == 0);
- ASSERT3U(dbuf_is_metadata(db), ==, arc_is_metadata(buf));
- ASSERT(buf != NULL);
- ASSERT(arc_buf_lsize(buf) == db->db.db_size);
- ASSERT(tx->tx_txg != 0);
-
- arc_return_buf(buf, db);
- ASSERT(arc_released(buf));
-
- mutex_enter(&db->db_mtx);
-
- while (db->db_state == DB_READ || db->db_state == DB_FILL)
- cv_wait(&db->db_changed, &db->db_mtx);
-
- ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED);
-
- if (db->db_state == DB_CACHED &&
- zfs_refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
- mutex_exit(&db->db_mtx);
- (void) dbuf_dirty(db, tx);
- bcopy(buf->b_data, db->db.db_data, db->db.db_size);
- arc_buf_destroy(buf, db);
- xuio_stat_wbuf_copied();
- return;
- }
-
- xuio_stat_wbuf_nocopy();
- if (db->db_state == DB_CACHED) {
- dbuf_dirty_record_t *dr = db->db_last_dirty;
-
- ASSERT(db->db_buf != NULL);
- if (dr != NULL && dr->dr_txg == tx->tx_txg) {
- ASSERT(dr->dt.dl.dr_data == db->db_buf);
- if (!arc_released(db->db_buf)) {
- ASSERT(dr->dt.dl.dr_override_state ==
- DR_OVERRIDDEN);
- arc_release(db->db_buf, db);
- }
- dr->dt.dl.dr_data = buf;
- arc_buf_destroy(db->db_buf, db);
- } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
- arc_release(db->db_buf, db);
- arc_buf_destroy(db->db_buf, db);
- }
- db->db_buf = NULL;
- }
- ASSERT(db->db_buf == NULL);
- dbuf_set_data(db, buf);
- db->db_state = DB_FILL;
- mutex_exit(&db->db_mtx);
- (void) dbuf_dirty(db, tx);
- dmu_buf_fill_done(&db->db, tx);
-}
-
-void
-dbuf_destroy(dmu_buf_impl_t *db)
-{
- dnode_t *dn;
- dmu_buf_impl_t *parent = db->db_parent;
- dmu_buf_impl_t *dndb;
-
- ASSERT(MUTEX_HELD(&db->db_mtx));
- ASSERT(zfs_refcount_is_zero(&db->db_holds));
-
- if (db->db_buf != NULL) {
- arc_buf_destroy(db->db_buf, db);
- db->db_buf = NULL;
- }
-
- if (db->db_blkid == DMU_BONUS_BLKID) {
- int slots = DB_DNODE(db)->dn_num_slots;
- int bonuslen = DN_SLOTS_TO_BONUSLEN(slots);
- if (db->db.db_data != NULL) {
- zio_buf_free(db->db.db_data, bonuslen);
- arc_space_return(bonuslen, ARC_SPACE_BONUS);
- db->db_state = DB_UNCACHED;
- }
- }
-
- dbuf_clear_data(db);
-
- if (multilist_link_active(&db->db_cache_link)) {
- ASSERT(db->db_caching_status == DB_DBUF_CACHE ||
- db->db_caching_status == DB_DBUF_METADATA_CACHE);
-
- multilist_remove(dbuf_caches[db->db_caching_status].cache, db);
- (void) zfs_refcount_remove_many(
- &dbuf_caches[db->db_caching_status].size,
- db->db.db_size, db);
-
- if (db->db_caching_status == DB_DBUF_METADATA_CACHE) {
- DBUF_STAT_BUMPDOWN(metadata_cache_count);
- } else {
- DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]);
- DBUF_STAT_BUMPDOWN(cache_count);
- DBUF_STAT_DECR(cache_levels_bytes[db->db_level],
- db->db.db_size);
- }
- db->db_caching_status = DB_NO_CACHE;
- }
-
- ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
- ASSERT(db->db_data_pending == NULL);
-
- db->db_state = DB_EVICTING;
- db->db_blkptr = NULL;
-
- /*
- * Now that db_state is DB_EVICTING, nobody else can find this via
- * the hash table. We can now drop db_mtx, which allows us to
- * acquire the dn_dbufs_mtx.
- */
- mutex_exit(&db->db_mtx);
-
- DB_DNODE_ENTER(db);
- dn = DB_DNODE(db);
- dndb = dn->dn_dbuf;
- if (db->db_blkid != DMU_BONUS_BLKID) {
- boolean_t needlock = !MUTEX_HELD(&dn->dn_dbufs_mtx);
- if (needlock)
- mutex_enter(&dn->dn_dbufs_mtx);
- avl_remove(&dn->dn_dbufs, db);
- membar_producer();
- DB_DNODE_EXIT(db);
- if (needlock)
- mutex_exit(&dn->dn_dbufs_mtx);
- /*
- * Decrementing the dbuf count means that the hold corresponding
- * to the removed dbuf is no longer discounted in dnode_move(),
- * so the dnode cannot be moved until after we release the hold.
- * The membar_producer() ensures visibility of the decremented
- * value in dnode_move(), since DB_DNODE_EXIT doesn't actually
- * release any lock.
- */
- mutex_enter(&dn->dn_mtx);
- dnode_rele_and_unlock(dn, db, B_TRUE);
- db->db_dnode_handle = NULL;
-
- dbuf_hash_remove(db);
- } else {
- DB_DNODE_EXIT(db);
- }
-
- ASSERT(zfs_refcount_is_zero(&db->db_holds));
-
- db->db_parent = NULL;
-
- ASSERT(db->db_buf == NULL);
- ASSERT(db->db.db_data == NULL);
- ASSERT(db->db_hash_next == NULL);
- ASSERT(db->db_blkptr == NULL);
- ASSERT(db->db_data_pending == NULL);
- ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);
- ASSERT(!multilist_link_active(&db->db_cache_link));
-
- kmem_cache_free(dbuf_kmem_cache, db);
- arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF);
-
- /*
- * If this dbuf is referenced from an indirect dbuf,
- * decrement the ref count on the indirect dbuf.
- */
- if (parent && parent != dndb) {
- mutex_enter(&parent->db_mtx);
- dbuf_rele_and_unlock(parent, db, B_TRUE);
- }
-}
-
-/*
- * Note: While bpp will always be updated if the function returns success,
- * parentp will not be updated if the dnode does not have dn_dbuf filled in;
- * this happens when the dnode is the meta-dnode, or a userused or groupused
- * object.
- */
-__attribute__((always_inline))
-static inline int
-dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
- dmu_buf_impl_t **parentp, blkptr_t **bpp, struct dbuf_hold_impl_data *dh)
-{
- *parentp = NULL;
- *bpp = NULL;
-
- ASSERT(blkid != DMU_BONUS_BLKID);
-
- if (blkid == DMU_SPILL_BLKID) {
- mutex_enter(&dn->dn_mtx);
- if (dn->dn_have_spill &&
- (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
- *bpp = DN_SPILL_BLKPTR(dn->dn_phys);
- else
- *bpp = NULL;
- dbuf_add_ref(dn->dn_dbuf, NULL);
- *parentp = dn->dn_dbuf;
- mutex_exit(&dn->dn_mtx);
- return (0);
- }
-
- int nlevels =
- (dn->dn_phys->dn_nlevels == 0) ? 1 : dn->dn_phys->dn_nlevels;
- int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
-
- ASSERT3U(level * epbs, <, 64);
- ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
- /*
- * This assertion shouldn't trip as long as the max indirect block size
- * is less than 1M. The reason for this is that up to that point,
- * the number of levels required to address an entire object with blocks
- * of size SPA_MINBLOCKSIZE satisfies nlevels * epbs + 1 <= 64. In
- * other words, if N * epbs + 1 > 64, then if (N-1) * epbs + 1 > 55
- * (i.e. we can address the entire object), objects will all use at most
- * N-1 levels and the assertion won't overflow. However, once epbs is
- * 13, 4 * 13 + 1 = 53, but 5 * 13 + 1 = 66. Then, 4 levels will not be
- * enough to address an entire object, so objects will have 5 levels,
- * but then this assertion will overflow.
- *
- * All this is to say that if we ever increase DN_MAX_INDBLKSHIFT, we
- * need to redo this logic to handle overflows.
- */
- ASSERT(level >= nlevels ||
- ((nlevels - level - 1) * epbs) +
- highbit64(dn->dn_phys->dn_nblkptr) <= 64);
- if (level >= nlevels ||
- blkid >= ((uint64_t)dn->dn_phys->dn_nblkptr <<
- ((nlevels - level - 1) * epbs)) ||
- (fail_sparse &&
- blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
- /* the buffer has no parent yet */
- return (SET_ERROR(ENOENT));
- } else if (level < nlevels-1) {
- /* this block is referenced from an indirect block */
- int err;
- if (dh == NULL) {
- err = dbuf_hold_impl(dn, level+1,
- blkid >> epbs, fail_sparse, FALSE, NULL, parentp);
- } else {
- __dbuf_hold_impl_init(dh + 1, dn, dh->dh_level + 1,
- blkid >> epbs, fail_sparse, FALSE, NULL,
- parentp, dh->dh_depth + 1);
- err = __dbuf_hold_impl(dh + 1);
- }
- if (err)
- return (err);
- err = dbuf_read(*parentp, NULL,
- (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
- if (err) {
- dbuf_rele(*parentp, NULL);
- *parentp = NULL;
- return (err);
- }
- *bpp = ((blkptr_t *)(*parentp)->db.db_data) +
- (blkid & ((1ULL << epbs) - 1));
- if (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))
- ASSERT(BP_IS_HOLE(*bpp));
- return (0);
- } else {
- /* the block is referenced from the dnode */
- ASSERT3U(level, ==, nlevels-1);
- ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
- blkid < dn->dn_phys->dn_nblkptr);
- if (dn->dn_dbuf) {
- dbuf_add_ref(dn->dn_dbuf, NULL);
- *parentp = dn->dn_dbuf;
- }
- *bpp = &dn->dn_phys->dn_blkptr[blkid];
- return (0);
- }
-}
-
-static dmu_buf_impl_t *
-dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
- dmu_buf_impl_t *parent, blkptr_t *blkptr)
-{
- objset_t *os = dn->dn_objset;
- dmu_buf_impl_t *db, *odb;
-
- ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
- ASSERT(dn->dn_type != DMU_OT_NONE);
-
- db = kmem_cache_alloc(dbuf_kmem_cache, KM_SLEEP);
-
- db->db_objset = os;
- db->db.db_object = dn->dn_object;
- db->db_level = level;
- db->db_blkid = blkid;
- db->db_last_dirty = NULL;
- db->db_dirtycnt = 0;
- db->db_dnode_handle = dn->dn_handle;
- db->db_parent = parent;
- db->db_blkptr = blkptr;
-
- db->db_user = NULL;
- db->db_user_immediate_evict = FALSE;
- db->db_freed_in_flight = FALSE;
- db->db_pending_evict = FALSE;
-
- if (blkid == DMU_BONUS_BLKID) {
- ASSERT3P(parent, ==, dn->dn_dbuf);
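- /*
- * Illustrative sizing, assuming the classic 512-byte dnode
- * (one slot): DN_SLOTS_TO_BONUSLEN(1) = 512 - 64 - 128 = 320
- * bytes, and each block pointer beyond the first consumes
- * another 128 bytes, e.g. 64 bytes of bonus space remain when
- * dn_nblkptr == 3.
- */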
- db->db.db_size = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) -
- (dn->dn_nblkptr-1) * sizeof (blkptr_t);
- ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
- db->db.db_offset = DMU_BONUS_BLKID;
- db->db_state = DB_UNCACHED;
- db->db_caching_status = DB_NO_CACHE;
- /* the bonus dbuf is not placed in the hash table */
- arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF);
- return (db);
- } else if (blkid == DMU_SPILL_BLKID) {
- db->db.db_size = (blkptr != NULL) ?
- BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
- db->db.db_offset = 0;
- } else {
- int blocksize =
- db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
- db->db.db_size = blocksize;
- db->db.db_offset = db->db_blkid * blocksize;
- }
-
- /*
- * Hold the dn_dbufs_mtx while we get the new dbuf
- * in the hash table *and* added to the dbufs list.
- * This prevents a possible deadlock with someone
- * trying to look up this dbuf before its added to the
- * dn_dbufs list.
- */
- mutex_enter(&dn->dn_dbufs_mtx);
- db->db_state = DB_EVICTING;
- if ((odb = dbuf_hash_insert(db)) != NULL) {
- /* someone else inserted it first */
- kmem_cache_free(dbuf_kmem_cache, db);
- mutex_exit(&dn->dn_dbufs_mtx);
- DBUF_STAT_BUMP(hash_insert_race);
- return (odb);
- }
- avl_add(&dn->dn_dbufs, db);
-
- db->db_state = DB_UNCACHED;
- db->db_caching_status = DB_NO_CACHE;
- mutex_exit(&dn->dn_dbufs_mtx);
- arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF);
-
- if (parent && parent != dn->dn_dbuf)
- dbuf_add_ref(parent, db);
-
- ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
- zfs_refcount_count(&dn->dn_holds) > 0);
- (void) zfs_refcount_add(&dn->dn_holds, db);
-
- dprintf_dbuf(db, "db=%p\n", db);
-
- return (db);
-}
-
-typedef struct dbuf_prefetch_arg {
- spa_t *dpa_spa; /* The spa to issue the prefetch in. */
- zbookmark_phys_t dpa_zb; /* The target block to prefetch. */
- int dpa_epbs; /* Entries (blkptr_t's) Per Block Shift. */
- int dpa_curlevel; /* The current level that we're reading */
- dnode_t *dpa_dnode; /* The dnode associated with the prefetch */
- zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */
- zio_t *dpa_zio; /* The parent zio_t for all prefetches. */
- arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */
-} dbuf_prefetch_arg_t;
-
-/*
- * Actually issue the prefetch read for the block given.
- */
-static void
-dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp)
-{
- if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
- return;
-
- arc_flags_t aflags =
- dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
-
- ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
- ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level);
- ASSERT(dpa->dpa_zio != NULL);
- (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, NULL, NULL,
- dpa->dpa_prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
- &aflags, &dpa->dpa_zb);
-}
-
-/*
- * Called when an indirect block above our prefetch target is read in. This
- * will either read in the next indirect block down the tree or issue the actual
- * prefetch if the next block down is our target.
- */
-static void
-dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb,
- const blkptr_t *iobp, arc_buf_t *abuf, void *private)
-{
- dbuf_prefetch_arg_t *dpa = private;
-
- ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel);
- ASSERT3S(dpa->dpa_curlevel, >, 0);
-
- if (abuf == NULL) {
- ASSERT(zio == NULL || zio->io_error != 0);
- kmem_free(dpa, sizeof (*dpa));
- return;
- }
- ASSERT(zio == NULL || zio->io_error == 0);
-
- /*
- * The dpa_dnode is only valid if we are called with a NULL
- * zio. This indicates that the arc_read() returned without
- * first calling zio_read() to issue a physical read. Once
- * a physical read is made the dpa_dnode must be invalidated
- * as the locks guarding it may have been dropped. If the
- * dpa_dnode is still valid, then we want to add it to the dbuf
- * cache. To do so, we must hold the dbuf associated with the block
- * we just prefetched, read its contents so that we associate it
- * with an arc_buf_t, and then release it.
- */
- if (zio != NULL) {
- ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel);
- if (zio->io_flags & ZIO_FLAG_RAW) {
- ASSERT3U(BP_GET_PSIZE(zio->io_bp), ==, zio->io_size);
- } else {
- ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size);
- }
- ASSERT3P(zio->io_spa, ==, dpa->dpa_spa);
-
- dpa->dpa_dnode = NULL;
- } else if (dpa->dpa_dnode != NULL) {
- uint64_t curblkid = dpa->dpa_zb.zb_blkid >>
- (dpa->dpa_epbs * (dpa->dpa_curlevel -
- dpa->dpa_zb.zb_level));
- dmu_buf_impl_t *db = dbuf_hold_level(dpa->dpa_dnode,
- dpa->dpa_curlevel, curblkid, FTAG);
- (void) dbuf_read(db, NULL,
- DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_HAVESTRUCT);
- dbuf_rele(db, FTAG);
- }
-
- dpa->dpa_curlevel--;
-
- uint64_t nextblkid = dpa->dpa_zb.zb_blkid >>
- (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level));
- blkptr_t *bp = ((blkptr_t *)abuf->b_data) +
- P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs);
- if (BP_IS_HOLE(bp)) {
- kmem_free(dpa, sizeof (*dpa));
- } else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) {
- ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid);
- dbuf_issue_final_prefetch(dpa, bp);
- kmem_free(dpa, sizeof (*dpa));
- } else {
- arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
- zbookmark_phys_t zb;
-
- /* flag if L2ARC eligible, l2arc_noprefetch then decides */
- if (dpa->dpa_aflags & ARC_FLAG_L2CACHE)
- iter_aflags |= ARC_FLAG_L2CACHE;
-
- ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
-
- SET_BOOKMARK(&zb, dpa->dpa_zb.zb_objset,
- dpa->dpa_zb.zb_object, dpa->dpa_curlevel, nextblkid);
-
- (void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
- bp, dbuf_prefetch_indirect_done, dpa, dpa->dpa_prio,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
- &iter_aflags, &zb);
- }
-
- arc_buf_destroy(abuf, private);
-}
-
-/*
- * Issue prefetch reads for the given block on the given level. If the indirect
- * blocks above that block are not in memory, we will read them in
- * asynchronously. As a result, this call never blocks waiting for a read to
- * complete.
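- *
- * A typical caller (dmu_prefetch() is one) holds dn_struct_rwlock as
- * reader and issues, e.g.:
- *
- *	dbuf_prefetch(dn, 0, blkid, ZIO_PRIORITY_ASYNC_READ, 0);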
- */
-void
-dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
- arc_flags_t aflags)
-{
- blkptr_t bp;
- int epbs, nlevels, curlevel;
- uint64_t curblkid;
-
- ASSERT(blkid != DMU_BONUS_BLKID);
- ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
-
- if (blkid > dn->dn_maxblkid)
- return;
-
- if (dnode_block_freed(dn, blkid))
- return;
-
- /*
- * This dnode hasn't been written to disk yet, so there's nothing to
- * prefetch.
- */
- nlevels = dn->dn_phys->dn_nlevels;
- if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0)
- return;
-
- epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
- if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level))
- return;
-
- dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object,
- level, blkid);
- if (db != NULL) {
- mutex_exit(&db->db_mtx);
- /*
- * This dbuf already exists. It is either CACHED, or
- * (we assume) about to be read or filled.
- */
- return;
- }
-
- /*
- * Find the closest ancestor (indirect block) of the target block
- * that is present in the cache. In this indirect block, we will
- * find the bp that is at curlevel, curblkid.
- */
- curlevel = level;
- curblkid = blkid;
- while (curlevel < nlevels - 1) {
- int parent_level = curlevel + 1;
- uint64_t parent_blkid = curblkid >> epbs;
- dmu_buf_impl_t *db;
-
- if (dbuf_hold_impl(dn, parent_level, parent_blkid,
- FALSE, TRUE, FTAG, &db) == 0) {
- blkptr_t *bpp = db->db_buf->b_data;
- bp = bpp[P2PHASE(curblkid, 1 << epbs)];
- dbuf_rele(db, FTAG);
- break;
- }
-
- curlevel = parent_level;
- curblkid = parent_blkid;
- }
-
- if (curlevel == nlevels - 1) {
- /* No cached indirect blocks found. */
- ASSERT3U(curblkid, <, dn->dn_phys->dn_nblkptr);
- bp = dn->dn_phys->dn_blkptr[curblkid];
- }
- if (BP_IS_HOLE(&bp))
- return;
-
- ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp));
-
- zio_t *pio = zio_root(dmu_objset_spa(dn->dn_objset), NULL, NULL,
- ZIO_FLAG_CANFAIL);
-
- dbuf_prefetch_arg_t *dpa = kmem_zalloc(sizeof (*dpa), KM_SLEEP);
- dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
- SET_BOOKMARK(&dpa->dpa_zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
- dn->dn_object, level, blkid);
- dpa->dpa_curlevel = curlevel;
- dpa->dpa_prio = prio;
- dpa->dpa_aflags = aflags;
- dpa->dpa_spa = dn->dn_objset->os_spa;
- dpa->dpa_dnode = dn;
- dpa->dpa_epbs = epbs;
- dpa->dpa_zio = pio;
-
- /* flag if L2ARC eligible, l2arc_noprefetch then decides */
- if (DNODE_LEVEL_IS_L2CACHEABLE(dn, level))
- dpa->dpa_aflags |= ARC_FLAG_L2CACHE;
-
- /*
- * If we have the indirect just above us, no need to do the asynchronous
- * prefetch chain; we'll just run the last step ourselves. If we're at
- * a higher level, though, we want to issue the prefetches for all the
- * indirect blocks asynchronously, so we can go on with whatever we were
- * doing.
- */
- if (curlevel == level) {
- ASSERT3U(curblkid, ==, blkid);
- dbuf_issue_final_prefetch(dpa, &bp);
- kmem_free(dpa, sizeof (*dpa));
- } else {
- arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
- zbookmark_phys_t zb;
-
- /* flag if L2ARC eligible, l2arc_noprefetch then decides */
- if (DNODE_LEVEL_IS_L2CACHEABLE(dn, level))
- iter_aflags |= ARC_FLAG_L2CACHE;
-
- SET_BOOKMARK(&zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
- dn->dn_object, curlevel, curblkid);
- (void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
- &bp, dbuf_prefetch_indirect_done, dpa, prio,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
- &iter_aflags, &zb);
- }
- /*
- * We use pio here instead of dpa_zio since it's possible that
- * dpa may have already been freed.
- */
- zio_nowait(pio);
-}
-
-#define DBUF_HOLD_IMPL_MAX_DEPTH 20
-
-/*
- * Helper function for __dbuf_hold_impl() to copy a buffer. Handles
- * the cases of compressed and uncompressed buffers by allocating the
- * new buffer with arc_alloc_compressed_buf() or arc_alloc_buf(),
- * respectively.
- *
- * NOTE: Declared noinline to avoid stack bloat in __dbuf_hold_impl().
- */
-noinline static void
-dbuf_hold_copy(struct dbuf_hold_impl_data *dh)
-{
- dnode_t *dn = dh->dh_dn;
- dmu_buf_impl_t *db = dh->dh_db;
- dbuf_dirty_record_t *dr = dh->dh_dr;
- arc_buf_t *data = dr->dt.dl.dr_data;
-
- enum zio_compress compress_type = arc_get_compression(data);
-
- if (compress_type != ZIO_COMPRESS_OFF) {
- dbuf_set_data(db, arc_alloc_compressed_buf(
- dn->dn_objset->os_spa, db, arc_buf_size(data),
- arc_buf_lsize(data), compress_type));
- } else {
- dbuf_set_data(db, arc_alloc_buf(dn->dn_objset->os_spa, db,
- DBUF_GET_BUFC_TYPE(db), db->db.db_size));
- }
-
- bcopy(data->b_data, db->db.db_data, arc_buf_size(data));
-}
-
-/*
- * Returns with db_holds incremented, and db_mtx not held.
- * Note: dn_struct_rwlock must be held.
- */
-static int
-__dbuf_hold_impl(struct dbuf_hold_impl_data *dh)
-{
- ASSERT3S(dh->dh_depth, <, DBUF_HOLD_IMPL_MAX_DEPTH);
- dh->dh_parent = NULL;
-
- ASSERT(dh->dh_blkid != DMU_BONUS_BLKID);
- ASSERT(RW_LOCK_HELD(&dh->dh_dn->dn_struct_rwlock));
- ASSERT3U(dh->dh_dn->dn_nlevels, >, dh->dh_level);
-
- *(dh->dh_dbp) = NULL;
-
- /* dbuf_find() returns with db_mtx held */
- dh->dh_db = dbuf_find(dh->dh_dn->dn_objset, dh->dh_dn->dn_object,
- dh->dh_level, dh->dh_blkid);
-
- if (dh->dh_db == NULL) {
- dh->dh_bp = NULL;
-
- if (dh->dh_fail_uncached)
- return (SET_ERROR(ENOENT));
-
- ASSERT3P(dh->dh_parent, ==, NULL);
- dh->dh_err = dbuf_findbp(dh->dh_dn, dh->dh_level, dh->dh_blkid,
- dh->dh_fail_sparse, &dh->dh_parent, &dh->dh_bp, dh);
- if (dh->dh_fail_sparse) {
- if (dh->dh_err == 0 &&
- dh->dh_bp && BP_IS_HOLE(dh->dh_bp))
- dh->dh_err = SET_ERROR(ENOENT);
- if (dh->dh_err) {
- if (dh->dh_parent)
- dbuf_rele(dh->dh_parent, NULL);
- return (dh->dh_err);
- }
- }
- if (dh->dh_err && dh->dh_err != ENOENT)
- return (dh->dh_err);
- dh->dh_db = dbuf_create(dh->dh_dn, dh->dh_level, dh->dh_blkid,
- dh->dh_parent, dh->dh_bp);
- }
-
- if (dh->dh_fail_uncached && dh->dh_db->db_state != DB_CACHED) {
- mutex_exit(&dh->dh_db->db_mtx);
- return (SET_ERROR(ENOENT));
- }
-
- if (dh->dh_db->db_buf != NULL) {
- arc_buf_access(dh->dh_db->db_buf);
- ASSERT3P(dh->dh_db->db.db_data, ==, dh->dh_db->db_buf->b_data);
- }
-
- ASSERT(dh->dh_db->db_buf == NULL || arc_referenced(dh->dh_db->db_buf));
-
- /*
- * If this buffer is currently syncing out, and we are
- * still referencing it from db_data, we need to make a copy
- * of it in case we decide we want to dirty it again in this txg.
- */
- if (dh->dh_db->db_level == 0 &&
- dh->dh_db->db_blkid != DMU_BONUS_BLKID &&
- dh->dh_dn->dn_object != DMU_META_DNODE_OBJECT &&
- dh->dh_db->db_state == DB_CACHED && dh->dh_db->db_data_pending) {
- dh->dh_dr = dh->dh_db->db_data_pending;
- if (dh->dh_dr->dt.dl.dr_data == dh->dh_db->db_buf)
- dbuf_hold_copy(dh);
- }
-
- if (multilist_link_active(&dh->dh_db->db_cache_link)) {
- ASSERT(zfs_refcount_is_zero(&dh->dh_db->db_holds));
- ASSERT(dh->dh_db->db_caching_status == DB_DBUF_CACHE ||
- dh->dh_db->db_caching_status == DB_DBUF_METADATA_CACHE);
-
- multilist_remove(
- dbuf_caches[dh->dh_db->db_caching_status].cache,
- dh->dh_db);
- (void) zfs_refcount_remove_many(
- &dbuf_caches[dh->dh_db->db_caching_status].size,
- dh->dh_db->db.db_size, dh->dh_db);
-
- if (dh->dh_db->db_caching_status == DB_DBUF_METADATA_CACHE) {
- DBUF_STAT_BUMPDOWN(metadata_cache_count);
- } else {
- DBUF_STAT_BUMPDOWN(cache_levels[dh->dh_db->db_level]);
- DBUF_STAT_BUMPDOWN(cache_count);
- DBUF_STAT_DECR(cache_levels_bytes[dh->dh_db->db_level],
- dh->dh_db->db.db_size);
- }
- dh->dh_db->db_caching_status = DB_NO_CACHE;
- }
- (void) zfs_refcount_add(&dh->dh_db->db_holds, dh->dh_tag);
- DBUF_VERIFY(dh->dh_db);
- mutex_exit(&dh->dh_db->db_mtx);
-
- /* NOTE: we can't rele the parent until after we drop the db_mtx */
- if (dh->dh_parent)
- dbuf_rele(dh->dh_parent, NULL);
-
- ASSERT3P(DB_DNODE(dh->dh_db), ==, dh->dh_dn);
- ASSERT3U(dh->dh_db->db_blkid, ==, dh->dh_blkid);
- ASSERT3U(dh->dh_db->db_level, ==, dh->dh_level);
- *(dh->dh_dbp) = dh->dh_db;
-
- return (0);
-}
-
-/*
- * The following code preserves the recursive function dbuf_hold_impl()
- * but moves the local variables AND function arguments to the heap to
- * minimize the stack frame size. Enough space is allocated on the
- * heap for 20 levels of recursion (DBUF_HOLD_IMPL_MAX_DEPTH).
- */
-int
-dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
- boolean_t fail_sparse, boolean_t fail_uncached,
- void *tag, dmu_buf_impl_t **dbp)
-{
- struct dbuf_hold_impl_data *dh;
- int error;
-
- dh = kmem_alloc(sizeof (struct dbuf_hold_impl_data) *
- DBUF_HOLD_IMPL_MAX_DEPTH, KM_SLEEP);
- __dbuf_hold_impl_init(dh, dn, level, blkid, fail_sparse,
- fail_uncached, tag, dbp, 0);
-
- error = __dbuf_hold_impl(dh);
-
- kmem_free(dh, sizeof (struct dbuf_hold_impl_data) *
- DBUF_HOLD_IMPL_MAX_DEPTH);
-
- return (error);
-}
-
-static void
-__dbuf_hold_impl_init(struct dbuf_hold_impl_data *dh,
- dnode_t *dn, uint8_t level, uint64_t blkid,
- boolean_t fail_sparse, boolean_t fail_uncached,
- void *tag, dmu_buf_impl_t **dbp, int depth)
-{
- dh->dh_dn = dn;
- dh->dh_level = level;
- dh->dh_blkid = blkid;
-
- dh->dh_fail_sparse = fail_sparse;
- dh->dh_fail_uncached = fail_uncached;
-
- dh->dh_tag = tag;
- dh->dh_dbp = dbp;
-
- dh->dh_db = NULL;
- dh->dh_parent = NULL;
- dh->dh_bp = NULL;
- dh->dh_err = 0;
- dh->dh_dr = NULL;
-
- dh->dh_depth = depth;
-}
-
-dmu_buf_impl_t *
-dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
-{
- return (dbuf_hold_level(dn, 0, blkid, tag));
-}
-
-dmu_buf_impl_t *
-dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
-{
- dmu_buf_impl_t *db;
- int err = dbuf_hold_impl(dn, level, blkid, FALSE, FALSE, tag, &db);
- return (err ? NULL : db);
-}
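For context, a minimal sketch (not part of this file) of the usual hold/release pairing around these entry points. The dnode pointer and blkid are assumed to come from the caller, FTAG is the conventional function-scoped hold tag, and dn_struct_rwlock must be held across the hold, as noted above __dbuf_hold_impl():

	/* Hypothetical caller: hold a level-0 block, use it, release it. */
	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	dmu_buf_impl_t *db = dbuf_hold(dn, blkid, FTAG);
	rw_exit(&dn->dn_struct_rwlock);
	if (db != NULL) {
		/* ... read db->db.db_data while the hold is live ... */
		dbuf_rele(db, FTAG);
	}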
-
-void
-dbuf_create_bonus(dnode_t *dn)
-{
- ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
-
- ASSERT(dn->dn_bonus == NULL);
- dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL);
-}
-
-int
-dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
-{
- dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
- dnode_t *dn;
-
- if (db->db_blkid != DMU_SPILL_BLKID)
- return (SET_ERROR(ENOTSUP));
- if (blksz == 0)
- blksz = SPA_MINBLOCKSIZE;
- ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset)));
- blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
-
- DB_DNODE_ENTER(db);
- dn = DB_DNODE(db);
- rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
- dbuf_new_size(db, blksz, tx);
- rw_exit(&dn->dn_struct_rwlock);
- DB_DNODE_EXIT(db);
-
- return (0);
-}
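The requested size above is clamped up to a multiple of SPA_MINBLOCKSIZE with P2ROUNDUP. A self-contained sketch of that arithmetic; the macro body shown is the conventional sysmacros.h power-of-two definition (reproduced here as an assumption, and align must be a power of two):

	#include <stdio.h>
	#include <stdint.h>

	/* Round x up to the next multiple of align (a power of two). */
	#define P2ROUNDUP(x, align)	(-(-(x) & -(align)))

	int
	main(void)
	{
		/* With SPA_MINBLOCKSIZE == 512: 1000 -> 1024, 1024 -> 1024. */
		printf("%ju %ju\n", (uintmax_t)P2ROUNDUP(1000, 512),
		    (uintmax_t)P2ROUNDUP(1024, 512));
		return (0);
	}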
-
-void
-dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx)
-{
- dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx);
-}
-
-#pragma weak dmu_buf_add_ref = dbuf_add_ref
-void
-dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
-{
- int64_t holds = zfs_refcount_add(&db->db_holds, tag);
- ASSERT3S(holds, >, 1);
-}
-
-#pragma weak dmu_buf_try_add_ref = dbuf_try_add_ref
-boolean_t
-dbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid,
- void *tag)
-{
- dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
- dmu_buf_impl_t *found_db;
- boolean_t result = B_FALSE;
-
- if (db->db_blkid == DMU_BONUS_BLKID)
- found_db = dbuf_find_bonus(os, obj);
- else
- found_db = dbuf_find(os, obj, 0, blkid);
-
- if (found_db != NULL) {
- if (db == found_db && dbuf_refcount(db) > db->db_dirtycnt) {
- (void) zfs_refcount_add(&db->db_holds, tag);
- result = B_TRUE;
- }
- mutex_exit(&db->db_mtx);
- }
- return (result);
-}
-
-/*
- * If you call dbuf_rele() you had better not be referencing the dnode handle
- * unless you have some other direct or indirect hold on the dnode. (An indirect
- * hold is a hold on one of the dnode's dbufs, including the bonus buffer.)
- * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the
- * dnode's parent dbuf evicting its dnode handles.
- */
-void
-dbuf_rele(dmu_buf_impl_t *db, void *tag)
-{
- mutex_enter(&db->db_mtx);
- dbuf_rele_and_unlock(db, tag, B_FALSE);
-}
-
-void
-dmu_buf_rele(dmu_buf_t *db, void *tag)
-{
- dbuf_rele((dmu_buf_impl_t *)db, tag);
-}
-
-/*
- * dbuf_rele() for an already-locked dbuf. This is necessary to allow
- * db_dirtycnt and db_holds to be updated atomically. The 'evicting'
- * argument should be set if we are already in the dbuf-evicting code
- * path, in which case we don't want to recursively evict. This allows us to
- * avoid deeply nested stacks that would have a call flow similar to this:
- *
- * dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify()
- * ^ |
- * | |
- * +-----dbuf_destroy()<--dbuf_evict_one()<--------+
- *
- */
-void
-dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag, boolean_t evicting)
-{
- int64_t holds;
- uint64_t size;
-
- ASSERT(MUTEX_HELD(&db->db_mtx));
- DBUF_VERIFY(db);
-
- /*
- * Remove the reference to the dbuf before removing its hold on the
- * dnode so we can guarantee in dnode_move() that a referenced bonus
- * buffer has a corresponding dnode hold.
- */
- holds = zfs_refcount_remove(&db->db_holds, tag);
- ASSERT(holds >= 0);
-
- /*
- * We can't freeze indirects if there is a possibility that they
- * may be modified in the current syncing context.
- */
- if (db->db_buf != NULL &&
- holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) {
- arc_buf_freeze(db->db_buf);
- }
-
- if (holds == db->db_dirtycnt &&
- db->db_level == 0 && db->db_user_immediate_evict)
- dbuf_evict_user(db);
-
- if (holds == 0) {
- if (db->db_blkid == DMU_BONUS_BLKID) {
- dnode_t *dn;
- boolean_t evict_dbuf = db->db_pending_evict;
-
- /*
- * If the dnode moves here, we cannot cross this
- * barrier until the move completes.
- */
- DB_DNODE_ENTER(db);
-
- dn = DB_DNODE(db);
- atomic_dec_32(&dn->dn_dbufs_count);
-
- /*
- * Decrementing the dbuf count means that the bonus
- * buffer's dnode hold is no longer discounted in
- * dnode_move(). The dnode cannot move until after
- * the dnode_rele() below.
- */
- DB_DNODE_EXIT(db);
-
- /*
- * Do not reference db after its lock is dropped.
- * Another thread may evict it.
- */
- mutex_exit(&db->db_mtx);
-
- if (evict_dbuf)
- dnode_evict_bonus(dn);
-
- dnode_rele(dn, db);
- } else if (db->db_buf == NULL) {
- /*
- * This is a special case: we never associated this
- * dbuf with any data allocated from the ARC.
- */
- ASSERT(db->db_state == DB_UNCACHED ||
- db->db_state == DB_NOFILL);
- dbuf_destroy(db);
- } else if (arc_released(db->db_buf)) {
- /*
- * This dbuf has anonymous data associated with it.
- */
- dbuf_destroy(db);
- } else {
- boolean_t do_arc_evict = B_FALSE;
- blkptr_t bp;
- spa_t *spa = dmu_objset_spa(db->db_objset);
-
- if (!DBUF_IS_CACHEABLE(db) &&
- db->db_blkptr != NULL &&
- !BP_IS_HOLE(db->db_blkptr) &&
- !BP_IS_EMBEDDED(db->db_blkptr)) {
- do_arc_evict = B_TRUE;
- bp = *db->db_blkptr;
- }
-
- if (!DBUF_IS_CACHEABLE(db) ||
- db->db_pending_evict) {
- dbuf_destroy(db);
- } else if (!multilist_link_active(&db->db_cache_link)) {
- ASSERT3U(db->db_caching_status, ==,
- DB_NO_CACHE);
-
- dbuf_cached_state_t dcs =
- dbuf_include_in_metadata_cache(db) ?
- DB_DBUF_METADATA_CACHE : DB_DBUF_CACHE;
- db->db_caching_status = dcs;
-
- multilist_insert(dbuf_caches[dcs].cache, db);
- size = zfs_refcount_add_many(
- &dbuf_caches[dcs].size, db->db.db_size, db);
-
- if (dcs == DB_DBUF_METADATA_CACHE) {
- DBUF_STAT_BUMP(metadata_cache_count);
- DBUF_STAT_MAX(
- metadata_cache_size_bytes_max,
- size);
- } else {
- DBUF_STAT_BUMP(
- cache_levels[db->db_level]);
- DBUF_STAT_BUMP(cache_count);
- DBUF_STAT_INCR(
- cache_levels_bytes[db->db_level],
- db->db.db_size);
- DBUF_STAT_MAX(cache_size_bytes_max,
- size);
- }
- mutex_exit(&db->db_mtx);
-
- if (dcs == DB_DBUF_CACHE && !evicting)
- dbuf_evict_notify(size);
- }
-
- if (do_arc_evict)
- arc_freed(spa, &bp);
- }
- } else {
- mutex_exit(&db->db_mtx);
- }
-}
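To make the hold arithmetic above concrete: a level-0 dbuf that is dirty in two txgs has db_dirtycnt == 2, and each dirty record contributes one hold. Once the last extra hold is dropped, holds equals db_dirtycnt, so no open-context consumer can still write through db_data and it is safe to arc_buf_freeze() the buffer; for indirect blocks the threshold is 0 instead, because syncing context may still modify them.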
-
-#pragma weak dmu_buf_refcount = dbuf_refcount
-uint64_t
-dbuf_refcount(dmu_buf_impl_t *db)
-{
- return (zfs_refcount_count(&db->db_holds));
-}
-
-void *
-dmu_buf_replace_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user,
- dmu_buf_user_t *new_user)
-{
- dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
-
- mutex_enter(&db->db_mtx);
- dbuf_verify_user(db, DBVU_NOT_EVICTING);
- if (db->db_user == old_user)
- db->db_user = new_user;
- else
- old_user = db->db_user;
- dbuf_verify_user(db, DBVU_NOT_EVICTING);
- mutex_exit(&db->db_mtx);
-
- return (old_user);
-}
-
-void *
-dmu_buf_set_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
-{
- return (dmu_buf_replace_user(db_fake, NULL, user));
-}
-
-void *
-dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user)
-{
- dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
-
- db->db_user_immediate_evict = TRUE;
- return (dmu_buf_set_user(db_fake, user));
-}
-
-void *
-dmu_buf_remove_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
-{
- return (dmu_buf_replace_user(db_fake, user, NULL));
-}
-
-void *
-dmu_buf_get_user(dmu_buf_t *db_fake)
-{
- dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
-
- dbuf_verify_user(db, DBVU_NOT_EVICTING);
- return (db->db_user);
-}
-
-void
-dmu_buf_user_evict_wait()
-{
- taskq_wait(dbu_evict_taskq);
-}
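A hedged sketch of a typical consumer of this user-data API. The struct, field, and callback names are illustrative; dmu_buf_init_user() is assumed to take sync and async eviction callbacks as in this vintage of dmu.h, and the eviction callback is what ultimately runs on the dbu_evict_taskq drained by dmu_buf_user_evict_wait():

	/* Illustrative per-object state hung off a held dmu_buf_t *db. */
	typedef struct my_user {
		dmu_buf_user_t mu_dbu;		/* must be the first member */
		uint64_t mu_cached_field;
	} my_user_t;

	static void
	my_user_evict(void *arg)
	{
		my_user_t *mu = arg;		/* embedded dbu is first */
		kmem_free(mu, sizeof (*mu));
	}

	my_user_t *mu = kmem_zalloc(sizeof (*mu), KM_SLEEP);
	dmu_buf_init_user(&mu->mu_dbu, my_user_evict, NULL, NULL);
	if (dmu_buf_set_user(db, &mu->mu_dbu) != NULL) {
		/* Lost the race: another thread attached its user first. */
		kmem_free(mu, sizeof (*mu));
	}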
-
-blkptr_t *
-dmu_buf_get_blkptr(dmu_buf_t *db)
-{
- dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
- return (dbi->db_blkptr);
-}
-
-objset_t *
-dmu_buf_get_objset(dmu_buf_t *db)
-{
- dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
- return (dbi->db_objset);
-}
-
-dnode_t *
-dmu_buf_dnode_enter(dmu_buf_t *db)
-{
- dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
- DB_DNODE_ENTER(dbi);
- return (DB_DNODE(dbi));
-}
-
-void
-dmu_buf_dnode_exit(dmu_buf_t *db)
-{
- dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
- DB_DNODE_EXIT(dbi);
-}
-
-static void
-dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
-{
- /* ASSERT(dmu_tx_is_syncing(tx)); */
- ASSERT(MUTEX_HELD(&db->db_mtx));
-
- if (db->db_blkptr != NULL)
- return;
-
- if (db->db_blkid == DMU_SPILL_BLKID) {
- db->db_blkptr = DN_SPILL_BLKPTR(dn->dn_phys);
- BP_ZERO(db->db_blkptr);
- return;
- }
- if (db->db_level == dn->dn_phys->dn_nlevels-1) {
- /*
- * This buffer was allocated at a time when there were
- * no available blkptrs from the dnode, or it was
- * inappropriate to hook it in (i.e., nlevels mismatch).
- */
- ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
- ASSERT(db->db_parent == NULL);
- db->db_parent = dn->dn_dbuf;
- db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
- DBUF_VERIFY(db);
- } else {
- dmu_buf_impl_t *parent = db->db_parent;
- int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
-
- ASSERT(dn->dn_phys->dn_nlevels > 1);
- if (parent == NULL) {
- mutex_exit(&db->db_mtx);
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
- parent = dbuf_hold_level(dn, db->db_level + 1,
- db->db_blkid >> epbs, db);
- rw_exit(&dn->dn_struct_rwlock);
- mutex_enter(&db->db_mtx);
- db->db_parent = parent;
- }
- db->db_blkptr = (blkptr_t *)parent->db.db_data +
- (db->db_blkid & ((1ULL << epbs) - 1));
- DBUF_VERIFY(db);
- }
-}
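The indexing math above is driven by epbs, the log2 of the number of block pointers per indirect block. A self-contained sketch of the parent/slot computation, assuming an indirect block shift of 17 (128K indirects), so epbs = 17 - SPA_BLKPTRSHIFT (7) = 10:

	#include <stdio.h>
	#include <stdint.h>

	int
	main(void)
	{
		int epbs = 17 - 7;	/* dn_indblkshift - SPA_BLKPTRSHIFT */
		uint64_t blkid = 1234567;

		/* Parent indirect holding this child, and the slot in it. */
		printf("parent blkid %ju, slot %ju\n",
		    (uintmax_t)(blkid >> epbs),
		    (uintmax_t)(blkid & ((1ULL << epbs) - 1)));
		return (0);
	}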
-
-/*
- * dbuf_sync_indirect() is called recursively from dbuf_sync_list() so it
- * is critical that we not allow the compiler to inline this function into
- * dbuf_sync_list(), which would drastically bloat the stack usage.
- */
-noinline static void
-dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
-{
- dmu_buf_impl_t *db = dr->dr_dbuf;
- dnode_t *dn;
- zio_t *zio;
-
- ASSERT(dmu_tx_is_syncing(tx));
-
- dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
-
- mutex_enter(&db->db_mtx);
-
- ASSERT(db->db_level > 0);
- DBUF_VERIFY(db);
-
- /* Read the block if it hasn't been read yet. */
- if (db->db_buf == NULL) {
- mutex_exit(&db->db_mtx);
- (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
- mutex_enter(&db->db_mtx);
- }
- ASSERT3U(db->db_state, ==, DB_CACHED);
- ASSERT(db->db_buf != NULL);
-
- DB_DNODE_ENTER(db);
- dn = DB_DNODE(db);
- /* Indirect block size must match what the dnode thinks it is. */
- ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
- dbuf_check_blkptr(dn, db);
- DB_DNODE_EXIT(db);
-
- /* Provide the pending dirty record to child dbufs */
- db->db_data_pending = dr;
-
- mutex_exit(&db->db_mtx);
-
- dbuf_write(dr, db->db_buf, tx);
-
- zio = dr->dr_zio;
- mutex_enter(&dr->dt.di.dr_mtx);
- dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx);
- ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
- mutex_exit(&dr->dt.di.dr_mtx);
- zio_nowait(zio);
-}
-
-/*
- * dbuf_sync_leaf() is called recursively from dbuf_sync_list() so it is
- * critical that we not allow the compiler to inline this function into
- * dbuf_sync_list(), which would drastically bloat the stack usage.
- */
-noinline static void
-dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
-{
- arc_buf_t **datap = &dr->dt.dl.dr_data;
- dmu_buf_impl_t *db = dr->dr_dbuf;
- dnode_t *dn;
- objset_t *os;
- uint64_t txg = tx->tx_txg;
-
- ASSERT(dmu_tx_is_syncing(tx));
-
- dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
-
- mutex_enter(&db->db_mtx);
- /*
- * To be synced, we must be dirtied. But we might have
- * been freed after being dirtied.
- */
- if (db->db_state == DB_UNCACHED) {
- /* This buffer has been freed since it was dirtied */
- ASSERT(db->db.db_data == NULL);
- } else if (db->db_state == DB_FILL) {
- /* This buffer was freed and is now being re-filled */
- ASSERT(db->db.db_data != dr->dt.dl.dr_data);
- } else {
- ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
- }
- DBUF_VERIFY(db);
-
- DB_DNODE_ENTER(db);
- dn = DB_DNODE(db);
-
- if (db->db_blkid == DMU_SPILL_BLKID) {
- mutex_enter(&dn->dn_mtx);
- if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
- /*
- * In the previous transaction group, the bonus buffer
- * was entirely used to store the attributes for the
- * dnode which overrode the dn_spill field. However,
- * when adding more attributes to the file a spill
- * block was required to hold the extra attributes.
- *
- * Make sure to clear the garbage left in the dn_spill
- * field from the previous attributes in the bonus
- * buffer. Otherwise, after writing out the spill
- * block to the new allocated dva, it will free
- * the old block pointed to by the invalid dn_spill.
- */
- db->db_blkptr = NULL;
- }
- dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR;
- mutex_exit(&dn->dn_mtx);
- }
-
- /*
- * If this is a bonus buffer, simply copy the bonus data into the
- * dnode. It will be written out when the dnode is synced (and it
- * will be synced, since it must have been dirty for dbuf_sync to
- * be called).
- */
- if (db->db_blkid == DMU_BONUS_BLKID) {
- dbuf_dirty_record_t **drp;
-
- ASSERT(*datap != NULL);
- ASSERT0(db->db_level);
- ASSERT3U(DN_MAX_BONUS_LEN(dn->dn_phys), <=,
- DN_SLOTS_TO_BONUSLEN(dn->dn_phys->dn_extra_slots + 1));
- bcopy(*datap, DN_BONUS(dn->dn_phys),
- DN_MAX_BONUS_LEN(dn->dn_phys));
- DB_DNODE_EXIT(db);
-
- if (*datap != db->db.db_data) {
- int slots = DB_DNODE(db)->dn_num_slots;
- int bonuslen = DN_SLOTS_TO_BONUSLEN(slots);
- zio_buf_free(*datap, bonuslen);
- arc_space_return(bonuslen, ARC_SPACE_BONUS);
- }
- db->db_data_pending = NULL;
- drp = &db->db_last_dirty;
- while (*drp != dr)
- drp = &(*drp)->dr_next;
- ASSERT(dr->dr_next == NULL);
- ASSERT(dr->dr_dbuf == db);
- *drp = dr->dr_next;
- if (dr->dr_dbuf->db_level != 0) {
- mutex_destroy(&dr->dt.di.dr_mtx);
- list_destroy(&dr->dt.di.dr_children);
- }
- kmem_free(dr, sizeof (dbuf_dirty_record_t));
- ASSERT(db->db_dirtycnt > 0);
- db->db_dirtycnt -= 1;
- dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg, B_FALSE);
- return;
- }
-
- os = dn->dn_objset;
-
- /*
- * This function may have dropped the db_mtx lock allowing a dmu_sync
- * operation to sneak in. As a result, we need to ensure that we
- * don't check the dr_override_state until we have returned from
- * dbuf_check_blkptr.
- */
- dbuf_check_blkptr(dn, db);
-
- /*
- * If this buffer is in the middle of an immediate write,
- * wait for the synchronous IO to complete.
- */
- while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
- ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
- cv_wait(&db->db_changed, &db->db_mtx);
- ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
- }
-
- if (db->db_state != DB_NOFILL &&
- dn->dn_object != DMU_META_DNODE_OBJECT &&
- zfs_refcount_count(&db->db_holds) > 1 &&
- dr->dt.dl.dr_override_state != DR_OVERRIDDEN &&
- *datap == db->db_buf) {
- /*
- * If this buffer is currently "in use" (i.e., there
- * are active holds and db_data still references it),
- * then make a copy before we start the write so that
- * any modifications from the open txg will not leak
- * into this write.
- *
- * NOTE: this copy does not need to be made for
- * objects only modified in the syncing context (e.g.
- * DMU_OT_DNODE blocks).
- */
- int psize = arc_buf_size(*datap);
- arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
- enum zio_compress compress_type = arc_get_compression(*datap);
-
- if (compress_type == ZIO_COMPRESS_OFF) {
- *datap = arc_alloc_buf(os->os_spa, db, type, psize);
- } else {
- ASSERT3U(type, ==, ARC_BUFC_DATA);
- int lsize = arc_buf_lsize(*datap);
- *datap = arc_alloc_compressed_buf(os->os_spa, db,
- psize, lsize, compress_type);
- }
- bcopy(db->db.db_data, (*datap)->b_data, psize);
- }
- db->db_data_pending = dr;
-
- mutex_exit(&db->db_mtx);
-
- dbuf_write(dr, *datap, tx);
-
- ASSERT(!list_link_active(&dr->dr_dirty_node));
- if (dn->dn_object == DMU_META_DNODE_OBJECT) {
- list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr);
- DB_DNODE_EXIT(db);
- } else {
- /*
- * Although zio_nowait() does not "wait for an IO", it does
- * initiate the IO. If this is an empty write it seems plausible
- * that the IO could actually be completed before the nowait
- * returns. We need to DB_DNODE_EXIT() first in case
- * zio_nowait() invalidates the dbuf.
- */
- DB_DNODE_EXIT(db);
- zio_nowait(dr->dr_zio);
- }
-}
-
-void
-dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx)
-{
- dbuf_dirty_record_t *dr;
-
- while ((dr = list_head(list)) != NULL) {
- if (dr->dr_zio != NULL) {
- /*
- * If we find an already initialized zio then we
- * are processing the meta-dnode, and we have finished.
- * The dbufs for all dnodes are put back on the list
- * during processing, so that we can zio_wait()
- * these IOs after initiating all child IOs.
- */
- ASSERT3U(dr->dr_dbuf->db.db_object, ==,
- DMU_META_DNODE_OBJECT);
- break;
- }
- if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
- dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
- VERIFY3U(dr->dr_dbuf->db_level, ==, level);
- }
- list_remove(list, dr);
- if (dr->dr_dbuf->db_level > 0)
- dbuf_sync_indirect(dr, tx);
- else
- dbuf_sync_leaf(dr, tx);
- }
-}
-
-/* ARGSUSED */
-static void
-dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
-{
- dmu_buf_impl_t *db = vdb;
- dnode_t *dn;
- blkptr_t *bp = zio->io_bp;
- blkptr_t *bp_orig = &zio->io_bp_orig;
- spa_t *spa = zio->io_spa;
- int64_t delta;
- uint64_t fill = 0;
- int i;
-
- ASSERT3P(db->db_blkptr, !=, NULL);
- ASSERT3P(&db->db_data_pending->dr_bp_copy, ==, bp);
-
- DB_DNODE_ENTER(db);
- dn = DB_DNODE(db);
- delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);
- dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
- zio->io_prev_space_delta = delta;
-
- if (bp->blk_birth != 0) {
- ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
- BP_GET_TYPE(bp) == dn->dn_type) ||
- (db->db_blkid == DMU_SPILL_BLKID &&
- BP_GET_TYPE(bp) == dn->dn_bonustype) ||
- BP_IS_EMBEDDED(bp));
- ASSERT(BP_GET_LEVEL(bp) == db->db_level);
- }
-
- mutex_enter(&db->db_mtx);
-
-#ifdef ZFS_DEBUG
- if (db->db_blkid == DMU_SPILL_BLKID) {
- ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
- ASSERT(!(BP_IS_HOLE(bp)) &&
- db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys));
- }
-#endif
-
- if (db->db_level == 0) {
- mutex_enter(&dn->dn_mtx);
- if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
- db->db_blkid != DMU_SPILL_BLKID)
- dn->dn_phys->dn_maxblkid = db->db_blkid;
- mutex_exit(&dn->dn_mtx);
-
- if (dn->dn_type == DMU_OT_DNODE) {
- i = 0;
- while (i < db->db.db_size) {
- dnode_phys_t *dnp =
- (void *)(((char *)db->db.db_data) + i);
-
- i += DNODE_MIN_SIZE;
- if (dnp->dn_type != DMU_OT_NONE) {
- fill++;
- i += dnp->dn_extra_slots *
- DNODE_MIN_SIZE;
- }
- }
- } else {
- if (BP_IS_HOLE(bp)) {
- fill = 0;
- } else {
- fill = 1;
- }
- }
- } else {
- blkptr_t *ibp = db->db.db_data;
- ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
- for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
- if (BP_IS_HOLE(ibp))
- continue;
- fill += BP_GET_FILL(ibp);
- }
- }
- DB_DNODE_EXIT(db);
-
- if (!BP_IS_EMBEDDED(bp))
- bp->blk_fill = fill;
-
- mutex_exit(&db->db_mtx);
-
- rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
- *db->db_blkptr = *bp;
- rw_exit(&dn->dn_struct_rwlock);
-}
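The fill count computed above is the number of non-hole leaves beneath this block: for an ordinary leaf it is 0 or 1, for an indirect it sums the children's fill, and for a dnode block it counts allocated dnodes while skipping the extra slots consumed by large dnodes. A self-contained sketch of that last walk, with slot-sized entries standing in for the DNODE_MIN_SIZE-sized chunks:

	#include <stdio.h>
	#include <stdint.h>

	struct fake_dnode {
		uint8_t dn_type;	/* 0 plays the role of DMU_OT_NONE */
		uint8_t dn_extra_slots;	/* extra slots used by this dnode */
	};

	int
	main(void)
	{
		/* A 2-slot dnode, a hole, and a 1-slot dnode: fill == 2. */
		struct fake_dnode blk[4] = {
			{ 1, 1 },	/* occupies slots 0 and 1 */
			{ 0, 0 },	/* shadowed by the extra slot */
			{ 0, 0 },	/* a real hole */
			{ 1, 0 },
		};
		uint64_t fill = 0;

		for (int i = 0; i < 4; ) {
			struct fake_dnode *dnp = &blk[i];
			i += 1;
			if (dnp->dn_type != 0) {
				fill++;
				i += dnp->dn_extra_slots;
			}
		}
		printf("fill = %ju\n", (uintmax_t)fill);	/* 2 */
		return (0);
	}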
-
-/* ARGSUSED */
-/*
- * This function gets called just prior to running through the compression
- * stage of the zio pipeline. If we're an indirect block composed of only
- * holes, then we want this indirect to be compressed away to a hole. In
- * order to do that we must zero out any information about the holes that
- * this indirect points to before we try to compress it.
- */
-static void
-dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
-{
- dmu_buf_impl_t *db = vdb;
- dnode_t *dn;
- blkptr_t *bp;
- unsigned int epbs, i;
-
- ASSERT3U(db->db_level, >, 0);
- DB_DNODE_ENTER(db);
- dn = DB_DNODE(db);
- epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
- ASSERT3U(epbs, <, 31);
-
- /* Determine if all our children are holes */
- for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++) {
- if (!BP_IS_HOLE(bp))
- break;
- }
-
- /*
- * If all the children are holes, then zero them all out so that
- * we may get compressed away.
- */
- if (i == 1 << epbs) {
- /*
- * We only found holes. Grab the rwlock to prevent
- * anybody from reading the blocks we're about to
- * zero out.
- */
- rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
- bzero(db->db.db_data, db->db.db_size);
- rw_exit(&dn->dn_struct_rwlock);
- }
- DB_DNODE_EXIT(db);
-}
-
-/*
- * The SPA will call this callback several times for each zio - once
- * for every physical child i/o (zio->io_phys_children times). This
- * allows the DMU to monitor the progress of each logical i/o. For example,
- * there may be 2 copies of an indirect block, or many fragments of a RAID-Z
- * block. There may be a long delay before all copies/fragments are completed,
- * so this callback allows us to retire dirty space gradually, as the physical
- * i/os complete.
- */
-/* ARGSUSED */
-static void
-dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg)
-{
- dmu_buf_impl_t *db = arg;
- objset_t *os = db->db_objset;
- dsl_pool_t *dp = dmu_objset_pool(os);
- dbuf_dirty_record_t *dr;
- int delta = 0;
-
- dr = db->db_data_pending;
- ASSERT3U(dr->dr_txg, ==, zio->io_txg);
-
- /*
- * The callback will be called io_phys_children times. Retire one
- * portion of our dirty space each time we are called. Any rounding
- * error will be cleaned up by dsl_pool_sync()'s call to
- * dsl_pool_undirty_space().
- */
- delta = dr->dr_accounted / zio->io_phys_children;
- dsl_pool_undirty_space(dp, delta, zio->io_txg);
-}
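As a worked example: with dr_accounted == 100 units and io_phys_children == 3, each of the three callbacks undirties 33 units, retiring 99 in total; the single leftover unit is exactly the rounding error that dsl_pool_sync()'s later dsl_pool_undirty_space() call cleans up.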
-
-/* ARGSUSED */
-static void
-dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
-{
- dmu_buf_impl_t *db = vdb;
- blkptr_t *bp_orig = &zio->io_bp_orig;
- blkptr_t *bp = db->db_blkptr;
- objset_t *os = db->db_objset;
- dmu_tx_t *tx = os->os_synctx;
- dbuf_dirty_record_t **drp, *dr;
-
- ASSERT0(zio->io_error);
- ASSERT(db->db_blkptr == bp);
-
- /*
- * For nopwrites and rewrites we ensure that the bp matches our
- * original and bypass all the accounting.
- */
- if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
- ASSERT(BP_EQUAL(bp, bp_orig));
- } else {
- dsl_dataset_t *ds = os->os_dsl_dataset;
- (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
- dsl_dataset_block_born(ds, bp, tx);
- }
-
- mutex_enter(&db->db_mtx);
-
- DBUF_VERIFY(db);
-
- drp = &db->db_last_dirty;
- while ((dr = *drp) != db->db_data_pending)
- drp = &dr->dr_next;
- ASSERT(!list_link_active(&dr->dr_dirty_node));
- ASSERT(dr->dr_dbuf == db);
- ASSERT(dr->dr_next == NULL);
- *drp = dr->dr_next;
-
-#ifdef ZFS_DEBUG
- if (db->db_blkid == DMU_SPILL_BLKID) {
- dnode_t *dn;
-
- DB_DNODE_ENTER(db);
- dn = DB_DNODE(db);
- ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
- ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
- db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys));
- DB_DNODE_EXIT(db);
- }
-#endif
-
- if (db->db_level == 0) {
- ASSERT(db->db_blkid != DMU_BONUS_BLKID);
- ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
- if (db->db_state != DB_NOFILL) {
- if (dr->dt.dl.dr_data != db->db_buf)
- arc_buf_destroy(dr->dt.dl.dr_data, db);
- }
- } else {
- dnode_t *dn;
-
- DB_DNODE_ENTER(db);
- dn = DB_DNODE(db);
- ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
- ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
- if (!BP_IS_HOLE(db->db_blkptr)) {
- int epbs =
- dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
- ASSERT3U(db->db_blkid, <=,
- dn->dn_phys->dn_maxblkid >> (db->db_level * epbs));
- ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
- db->db.db_size);
- }
- DB_DNODE_EXIT(db);
- mutex_destroy(&dr->dt.di.dr_mtx);
- list_destroy(&dr->dt.di.dr_children);
- }
- kmem_free(dr, sizeof (dbuf_dirty_record_t));
-
- cv_broadcast(&db->db_changed);
- ASSERT(db->db_dirtycnt > 0);
- db->db_dirtycnt -= 1;
- db->db_data_pending = NULL;
- dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE);
-}
-
-static void
-dbuf_write_nofill_ready(zio_t *zio)
-{
- dbuf_write_ready(zio, NULL, zio->io_private);
-}
-
-static void
-dbuf_write_nofill_done(zio_t *zio)
-{
- dbuf_write_done(zio, NULL, zio->io_private);
-}
-
-static void
-dbuf_write_override_ready(zio_t *zio)
-{
- dbuf_dirty_record_t *dr = zio->io_private;
- dmu_buf_impl_t *db = dr->dr_dbuf;
-
- dbuf_write_ready(zio, NULL, db);
-}
-
-static void
-dbuf_write_override_done(zio_t *zio)
-{
- dbuf_dirty_record_t *dr = zio->io_private;
- dmu_buf_impl_t *db = dr->dr_dbuf;
- blkptr_t *obp = &dr->dt.dl.dr_overridden_by;
-
- mutex_enter(&db->db_mtx);
- if (!BP_EQUAL(zio->io_bp, obp)) {
- if (!BP_IS_HOLE(obp))
- dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp);
- arc_release(dr->dt.dl.dr_data, db);
- }
- mutex_exit(&db->db_mtx);
- dbuf_write_done(zio, NULL, db);
-
- if (zio->io_abd != NULL)
- abd_put(zio->io_abd);
-}
-
-typedef struct dbuf_remap_impl_callback_arg {
- objset_t *drica_os;
- uint64_t drica_blk_birth;
- dmu_tx_t *drica_tx;
-} dbuf_remap_impl_callback_arg_t;
-
-static void
-dbuf_remap_impl_callback(uint64_t vdev, uint64_t offset, uint64_t size,
- void *arg)
-{
- dbuf_remap_impl_callback_arg_t *drica = arg;
- objset_t *os = drica->drica_os;
- spa_t *spa = dmu_objset_spa(os);
- dmu_tx_t *tx = drica->drica_tx;
-
- ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));
-
- if (os == spa_meta_objset(spa)) {
- spa_vdev_indirect_mark_obsolete(spa, vdev, offset, size, tx);
- } else {
- dsl_dataset_block_remapped(dmu_objset_ds(os), vdev, offset,
- size, drica->drica_blk_birth, tx);
- }
-}
-
-static void
-dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, dmu_tx_t *tx)
-{
- blkptr_t bp_copy = *bp;
- spa_t *spa = dmu_objset_spa(dn->dn_objset);
- dbuf_remap_impl_callback_arg_t drica;
-
- ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));
-
- drica.drica_os = dn->dn_objset;
- drica.drica_blk_birth = bp->blk_birth;
- drica.drica_tx = tx;
- if (spa_remap_blkptr(spa, &bp_copy, dbuf_remap_impl_callback,
- &drica)) {
- /*
- * The struct_rwlock prevents dbuf_read_impl() from
- * dereferencing the BP while we are changing it. To
- * avoid lock contention, only grab it when we are actually
- * changing the BP.
- */
- rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
- *bp = bp_copy;
- rw_exit(&dn->dn_struct_rwlock);
- }
-}
-
-/*
- * Returns true if a dbuf_remap would modify the dbuf. We determine this by
- * attempting to remap a copy of every bp in the dbuf.
- */
-boolean_t
-dbuf_can_remap(const dmu_buf_impl_t *db)
-{
- spa_t *spa = dmu_objset_spa(db->db_objset);
- blkptr_t *bp = db->db.db_data;
- boolean_t ret = B_FALSE;
-
- ASSERT3U(db->db_level, >, 0);
- ASSERT3S(db->db_state, ==, DB_CACHED);
-
- ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL));
-
- spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
- for (int i = 0; i < db->db.db_size >> SPA_BLKPTRSHIFT; i++) {
- blkptr_t bp_copy = bp[i];
- if (spa_remap_blkptr(spa, &bp_copy, NULL, NULL)) {
- ret = B_TRUE;
- break;
- }
- }
- spa_config_exit(spa, SCL_VDEV, FTAG);
-
- return (ret);
-}
-
-boolean_t
-dnode_needs_remap(const dnode_t *dn)
-{
- spa_t *spa = dmu_objset_spa(dn->dn_objset);
- boolean_t ret = B_FALSE;
-
- if (dn->dn_phys->dn_nlevels == 0) {
- return (B_FALSE);
- }
-
- ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL));
-
- spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
- for (int j = 0; j < dn->dn_phys->dn_nblkptr; j++) {
- blkptr_t bp_copy = dn->dn_phys->dn_blkptr[j];
- if (spa_remap_blkptr(spa, &bp_copy, NULL, NULL)) {
- ret = B_TRUE;
- break;
- }
- }
- spa_config_exit(spa, SCL_VDEV, FTAG);
-
- return (ret);
-}
-
-/*
- * Remap any existing BP's to concrete vdevs, if possible.
- */
-static void
-dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx)
-{
- spa_t *spa = dmu_objset_spa(db->db_objset);
- ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));
-
- if (!spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL))
- return;
-
- if (db->db_level > 0) {
- blkptr_t *bp = db->db.db_data;
- for (int i = 0; i < db->db.db_size >> SPA_BLKPTRSHIFT; i++) {
- dbuf_remap_impl(dn, &bp[i], tx);
- }
- } else if (db->db.db_object == DMU_META_DNODE_OBJECT) {
- dnode_phys_t *dnp = db->db.db_data;
- ASSERT3U(db->db_dnode_handle->dnh_dnode->dn_type, ==,
- DMU_OT_DNODE);
- for (int i = 0; i < db->db.db_size >> DNODE_SHIFT;
- i += dnp[i].dn_extra_slots + 1) {
- for (int j = 0; j < dnp[i].dn_nblkptr; j++) {
- dbuf_remap_impl(dn, &dnp[i].dn_blkptr[j], tx);
- }
- }
- }
-}
-
-
-/* Issue I/O to commit a dirty buffer to disk. */
-static void
-dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
-{
- dmu_buf_impl_t *db = dr->dr_dbuf;
- dnode_t *dn;
- objset_t *os;
- dmu_buf_impl_t *parent = db->db_parent;
- uint64_t txg = tx->tx_txg;
- zbookmark_phys_t zb;
- zio_prop_t zp;
- zio_t *zio;
- int wp_flag = 0;
-
- ASSERT(dmu_tx_is_syncing(tx));
-
- DB_DNODE_ENTER(db);
- dn = DB_DNODE(db);
- os = dn->dn_objset;
-
- if (db->db_state != DB_NOFILL) {
- if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
- /*
- * Private object buffers are released here rather
- * than in dbuf_dirty() since they are only modified
- * in the syncing context and we don't want the
- * overhead of making multiple copies of the data.
- */
- if (BP_IS_HOLE(db->db_blkptr)) {
- arc_buf_thaw(data);
- } else {
- dbuf_release_bp(db);
- }
- dbuf_remap(dn, db, tx);
- }
- }
-
- if (parent != dn->dn_dbuf) {
- /* Our parent is an indirect block. */
- /* We have a dirty parent that has been scheduled for write. */
- ASSERT(parent && parent->db_data_pending);
- /* Our parent's buffer is one level closer to the dnode. */
- ASSERT(db->db_level == parent->db_level-1);
- /*
- * We're about to modify our parent's db_data by modifying
- * our block pointer, so the parent must be released.
- */
- ASSERT(arc_released(parent->db_buf));
- zio = parent->db_data_pending->dr_zio;
- } else {
- /* Our parent is the dnode itself. */
- ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
- db->db_blkid != DMU_SPILL_BLKID) ||
- (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
- if (db->db_blkid != DMU_SPILL_BLKID)
- ASSERT3P(db->db_blkptr, ==,
- &dn->dn_phys->dn_blkptr[db->db_blkid]);
- zio = dn->dn_zio;
- }
-
- ASSERT(db->db_level == 0 || data == db->db_buf);
- ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
- ASSERT(zio);
-
- SET_BOOKMARK(&zb, os->os_dsl_dataset ?
- os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
- db->db.db_object, db->db_level, db->db_blkid);
-
- if (db->db_blkid == DMU_SPILL_BLKID)
- wp_flag = WP_SPILL;
- wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;
-
- dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
- DB_DNODE_EXIT(db);
-
- /*
- * We copy the blkptr now (rather than when we instantiate the dirty
- * record), because its value can change between open context and
- * syncing context. We do not need to hold dn_struct_rwlock to read
- * db_blkptr because we are in syncing context.
- */
- dr->dr_bp_copy = *db->db_blkptr;
-
- if (db->db_level == 0 &&
- dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
- /*
- * The BP for this block has been provided by open context
- * (by dmu_sync() or dmu_buf_write_embedded()).
- */
- abd_t *contents = (data != NULL) ?
- abd_get_from_buf(data->b_data, arc_buf_size(data)) : NULL;
-
- dr->dr_zio = zio_write(zio, os->os_spa, txg, &dr->dr_bp_copy,
- contents, db->db.db_size, db->db.db_size, &zp,
- dbuf_write_override_ready, NULL, NULL,
- dbuf_write_override_done,
- dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
- mutex_enter(&db->db_mtx);
- dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
- zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
- dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
- mutex_exit(&db->db_mtx);
- } else if (db->db_state == DB_NOFILL) {
- ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
- zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
- dr->dr_zio = zio_write(zio, os->os_spa, txg,
- &dr->dr_bp_copy, NULL, db->db.db_size, db->db.db_size, &zp,
- dbuf_write_nofill_ready, NULL, NULL,
- dbuf_write_nofill_done, db,
- ZIO_PRIORITY_ASYNC_WRITE,
- ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
- } else {
- ASSERT(arc_released(data));
-
- /*
- * For indirect blocks, we want to setup the children
- * ready callback so that we can properly handle an indirect
- * block that only contains holes.
- */
- arc_write_done_func_t *children_ready_cb = NULL;
- if (db->db_level != 0)
- children_ready_cb = dbuf_write_children_ready;
-
- dr->dr_zio = arc_write(zio, os->os_spa, txg,
- &dr->dr_bp_copy, data, DBUF_IS_L2CACHEABLE(db),
- &zp, dbuf_write_ready, children_ready_cb,
- dbuf_write_physdone, dbuf_write_done, db,
- ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
- }
-}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf_stats.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf_stats.c
deleted file mode 100644
index 0a86830f71ad..000000000000
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf_stats.c
+++ /dev/null
@@ -1,242 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-#include <sys/zfs_context.h>
-#include <sys/dbuf.h>
-#include <sys/dmu_objset.h>
-
-/*
- * Whether to calculate the index of the arc header within its state
- * (see arc_buf_info()); disabled by default.
- */
-int zfs_dbuf_state_index = 0;
-
-/*
- * ==========================================================================
- * Dbuf Hash Read Routines
- * ==========================================================================
- */
-typedef struct dbuf_stats_t {
- kmutex_t lock;
- kstat_t *kstat;
- dbuf_hash_table_t *hash;
- int idx;
-} dbuf_stats_t;
-
-static dbuf_stats_t dbuf_stats_hash_table;
-
-static int
-dbuf_stats_hash_table_headers(char *buf, size_t size)
-{
- size = snprintf(buf, size - 1,
- "%-88s | %-124s | %s\n"
- "%-16s %-8s %-8s %-8s %-8s %-8s %-8s %-5s %-5s %5s | "
- "%-5s %-5s %-6s %-8s %-6s %-8s %-12s "
- "%-6s %-6s %-6s %-6s %-6s %-8s %-8s %-8s %-5s | "
- "%-6s %-6s %-8s %-8s %-6s %-6s %-5s %-8s %-8s\n",
- "dbuf", "arcbuf", "dnode", "pool", "objset", "object", "level",
- "blkid", "offset", "dbsize", "meta", "state", "dbholds", "list",
- "atype", "index", "flags", "count", "asize", "access", "mru", "gmru",
- "mfu", "gmfu", "l2", "l2_dattr", "l2_asize", "l2_comp", "aholds",
- "dtype", "btype", "data_bs", "meta_bs", "bsize",
- "lvls", "dholds", "blocks", "dsize");
- buf[size] = '\0';
-
- return (0);
-}
-
-int
-__dbuf_stats_hash_table_data(char *buf, size_t size, dmu_buf_impl_t *db)
-{
- arc_buf_info_t abi = { 0 };
- dmu_object_info_t doi = { 0 };
- dnode_t *dn = DB_DNODE(db);
-
- if (db->db_buf)
- arc_buf_info(db->db_buf, &abi, zfs_dbuf_state_index);
-
- if (dn)
- __dmu_object_info_from_dnode(dn, &doi);
-
- size = snprintf(buf, size - 1,
- "%-16s %-8llu %-8lld %-8lld %-8lld %-8llu %-8llu %-5d %-5d %-5lu | "
- "%-5d %-5d %-6lld 0x%-6x %-6lu %-8llu %-12llu "
- "%-6lu %-6lu %-6lu %-6lu %-6lu %-8llu %-8llu %-8d %-5lu | "
- "%-6d %-6d %-8lu %-8lu %-6llu %-6lu %-5lu %-8llu %-8llu\n",
- /* dmu_buf_impl_t */
- spa_name(dn->dn_objset->os_spa),
- (u_longlong_t)dmu_objset_id(db->db_objset),
- (longlong_t)db->db.db_object,
- (longlong_t)db->db_level,
- (longlong_t)db->db_blkid,
- (u_longlong_t)db->db.db_offset,
- (u_longlong_t)db->db.db_size,
- !!dbuf_is_metadata(db),
- db->db_state,
- (ulong_t)zfs_refcount_count(&db->db_holds),
- /* arc_buf_info_t */
- abi.abi_state_type,
- abi.abi_state_contents,
- (longlong_t)abi.abi_state_index,
- abi.abi_flags,
- (ulong_t)abi.abi_bufcnt,
- (u_longlong_t)abi.abi_size,
- (u_longlong_t)abi.abi_access,
- (ulong_t)abi.abi_mru_hits,
- (ulong_t)abi.abi_mru_ghost_hits,
- (ulong_t)abi.abi_mfu_hits,
- (ulong_t)abi.abi_mfu_ghost_hits,
- (ulong_t)abi.abi_l2arc_hits,
- (u_longlong_t)abi.abi_l2arc_dattr,
- (u_longlong_t)abi.abi_l2arc_asize,
- abi.abi_l2arc_compress,
- (ulong_t)abi.abi_holds,
- /* dmu_object_info_t */
- doi.doi_type,
- doi.doi_bonus_type,
- (ulong_t)doi.doi_data_block_size,
- (ulong_t)doi.doi_metadata_block_size,
- (u_longlong_t)doi.doi_bonus_size,
- (ulong_t)doi.doi_indirection,
- (ulong_t)zfs_refcount_count(&dn->dn_holds),
- (u_longlong_t)doi.doi_fill_count,
- (u_longlong_t)doi.doi_max_offset);
- buf[size] = '\0';
-
- return (size);
-}
-
-static int
-dbuf_stats_hash_table_data(char *buf, size_t size, void *data)
-{
- dbuf_stats_t *dsh = (dbuf_stats_t *)data;
- dbuf_hash_table_t *h = dsh->hash;
- dmu_buf_impl_t *db;
- int length, error = 0;
-
- ASSERT3S(dsh->idx, >=, 0);
- ASSERT3S(dsh->idx, <=, h->hash_table_mask);
- memset(buf, 0, size);
-
- mutex_enter(DBUF_HASH_MUTEX(h, dsh->idx));
- for (db = h->hash_table[dsh->idx]; db != NULL; db = db->db_hash_next) {
- /*
- * Returning ENOMEM will cause the data and header functions
- * to be called with larger scratch buffers.
- */
- if (size < 512) {
- error = ENOMEM;
- break;
- }
-
- mutex_enter(&db->db_mtx);
- mutex_exit(DBUF_HASH_MUTEX(h, dsh->idx));
-
- length = __dbuf_stats_hash_table_data(buf, size, db);
- buf += length;
- size -= length;
-
- mutex_exit(&db->db_mtx);
- mutex_enter(DBUF_HASH_MUTEX(h, dsh->idx));
- }
- mutex_exit(DBUF_HASH_MUTEX(h, dsh->idx));
-
- return (error);
-}
-
-static void *
-dbuf_stats_hash_table_addr(kstat_t *ksp, off_t n)
-{
- dbuf_stats_t *dsh = ksp->ks_private;
-
- ASSERT(MUTEX_HELD(&dsh->lock));
-
- if (n <= dsh->hash->hash_table_mask) {
- dsh->idx = n;
- return (dsh);
- }
-
- return (NULL);
-}
-
-#ifndef __FreeBSD__
-/*
- * XXX The FreeBSD SPL is missing support for KSTAT_TYPE_RAW;
- * we can enable this as soon as that's implemented. See the
- * lindebugfs module for similar callback semantics.
- */
-static void
-dbuf_stats_hash_table_init(dbuf_hash_table_t *hash)
-{
- dbuf_stats_t *dsh = &dbuf_stats_hash_table;
- kstat_t *ksp;
-
- mutex_init(&dsh->lock, NULL, MUTEX_DEFAULT, NULL);
- dsh->hash = hash;
-
- ksp = kstat_create("zfs", 0, "dbufs", "misc",
- KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
- dsh->kstat = ksp;
-
- if (ksp) {
- ksp->ks_lock = &dsh->lock;
- ksp->ks_ndata = UINT32_MAX;
- ksp->ks_private = dsh;
- kstat_set_raw_ops(ksp, dbuf_stats_hash_table_headers,
- dbuf_stats_hash_table_data, dbuf_stats_hash_table_addr);
- kstat_install(ksp);
- }
-}
-
-static void
-dbuf_stats_hash_table_destroy(void)
-{
- dbuf_stats_t *dsh = &dbuf_stats_hash_table;
- kstat_t *ksp;
-
- ksp = dsh->kstat;
- if (ksp)
- kstat_delete(ksp);
-
- mutex_destroy(&dsh->lock);
-}
-#else
-static void
-dbuf_stats_hash_table_init(dbuf_hash_table_t *hash)
-{
-}
-
-static void
-dbuf_stats_hash_table_destroy(void)
-{
-}
-#endif
-
-void
-dbuf_stats_init(dbuf_hash_table_t *hash)
-{
- dbuf_stats_hash_table_init(hash);
-}
-
-void
-dbuf_stats_destroy(void)
-{
- dbuf_stats_hash_table_destroy();
-}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c
deleted file mode 100644
index 964aa6c054f5..000000000000
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c
+++ /dev/null
@@ -1,1189 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
- */
-
-#include <sys/zfs_context.h>
-#include <sys/spa.h>
-#include <sys/spa_impl.h>
-#include <sys/zio.h>
-#include <sys/ddt.h>
-#include <sys/zap.h>
-#include <sys/dmu_tx.h>
-#include <sys/arc.h>
-#include <sys/dsl_pool.h>
-#include <sys/zio_checksum.h>
-#include <sys/zio_compress.h>
-#include <sys/dsl_scan.h>
-#include <sys/abd.h>
-
-/*
- * Enable/disable prefetching of dedup-ed blocks which are going to be freed.
- */
-int zfs_dedup_prefetch = 1;
-
-SYSCTL_DECL(_vfs_zfs);
-SYSCTL_NODE(_vfs_zfs, OID_AUTO, dedup, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
- "ZFS DEDUP");
-SYSCTL_INT(_vfs_zfs_dedup, OID_AUTO, prefetch, CTLFLAG_RWTUN, &zfs_dedup_prefetch,
- 0, "Enable/disable prefetching of dedup-ed blocks which are going to be freed");
-
-static const ddt_ops_t *ddt_ops[DDT_TYPES] = {
- &ddt_zap_ops,
-};
-
-static const char *ddt_class_name[DDT_CLASSES] = {
- "ditto",
- "duplicate",
- "unique",
-};
-
-static void
-ddt_object_create(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
- dmu_tx_t *tx)
-{
- spa_t *spa = ddt->ddt_spa;
- objset_t *os = ddt->ddt_os;
- uint64_t *objectp = &ddt->ddt_object[type][class];
- boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_flags &
- ZCHECKSUM_FLAG_DEDUP;
- char name[DDT_NAMELEN];
-
- ddt_object_name(ddt, type, class, name);
-
- ASSERT(*objectp == 0);
- VERIFY(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash) == 0);
- ASSERT(*objectp != 0);
-
- VERIFY(zap_add(os, DMU_POOL_DIRECTORY_OBJECT, name,
- sizeof (uint64_t), 1, objectp, tx) == 0);
-
- VERIFY(zap_add(os, spa->spa_ddt_stat_object, name,
- sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
- &ddt->ddt_histogram[type][class], tx) == 0);
-}
-
-static void
-ddt_object_destroy(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
- dmu_tx_t *tx)
-{
- spa_t *spa = ddt->ddt_spa;
- objset_t *os = ddt->ddt_os;
- uint64_t *objectp = &ddt->ddt_object[type][class];
- uint64_t count;
- char name[DDT_NAMELEN];
-
- ddt_object_name(ddt, type, class, name);
-
- ASSERT(*objectp != 0);
- VERIFY(ddt_object_count(ddt, type, class, &count) == 0 && count == 0);
- ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class]));
- VERIFY(zap_remove(os, DMU_POOL_DIRECTORY_OBJECT, name, tx) == 0);
- VERIFY(zap_remove(os, spa->spa_ddt_stat_object, name, tx) == 0);
- VERIFY(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx) == 0);
- bzero(&ddt->ddt_object_stats[type][class], sizeof (ddt_object_t));
-
- *objectp = 0;
-}
-
-static int
-ddt_object_load(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
-{
- ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
- dmu_object_info_t doi;
- uint64_t count;
- char name[DDT_NAMELEN];
- int error;
-
- ddt_object_name(ddt, type, class, name);
-
- error = zap_lookup(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name,
- sizeof (uint64_t), 1, &ddt->ddt_object[type][class]);
-
- if (error != 0)
- return (error);
-
- VERIFY0(zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
- sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
- &ddt->ddt_histogram[type][class]));
-
- /*
- * Seed the cached statistics.
- */
- VERIFY(ddt_object_info(ddt, type, class, &doi) == 0);
-
- error = ddt_object_count(ddt, type, class, &count);
- if (error)
- return (error);
-
- ddo->ddo_count = count;
- ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
- ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
-
- return (0);
-}
-
-static void
-ddt_object_sync(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
- dmu_tx_t *tx)
-{
- ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
- dmu_object_info_t doi;
- uint64_t count;
- char name[DDT_NAMELEN];
-
- ddt_object_name(ddt, type, class, name);
-
- VERIFY(zap_update(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
- sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
- &ddt->ddt_histogram[type][class], tx) == 0);
-
- /*
- * Cache DDT statistics; this is the only time they'll change.
- */
- VERIFY(ddt_object_info(ddt, type, class, &doi) == 0);
- VERIFY(ddt_object_count(ddt, type, class, &count) == 0);
-
- ddo->ddo_count = count;
- ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
- ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
-}
-
-static int
-ddt_object_lookup(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
- ddt_entry_t *dde)
-{
- if (!ddt_object_exists(ddt, type, class))
- return (SET_ERROR(ENOENT));
-
- return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os,
- ddt->ddt_object[type][class], dde));
-}
-
-static void
-ddt_object_prefetch(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
- ddt_entry_t *dde)
-{
- if (!ddt_object_exists(ddt, type, class))
- return;
-
- ddt_ops[type]->ddt_op_prefetch(ddt->ddt_os,
- ddt->ddt_object[type][class], dde);
-}
-
-int
-ddt_object_update(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
- ddt_entry_t *dde, dmu_tx_t *tx)
-{
- ASSERT(ddt_object_exists(ddt, type, class));
-
- return (ddt_ops[type]->ddt_op_update(ddt->ddt_os,
- ddt->ddt_object[type][class], dde, tx));
-}
-
-static int
-ddt_object_remove(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
- ddt_entry_t *dde, dmu_tx_t *tx)
-{
- ASSERT(ddt_object_exists(ddt, type, class));
-
- return (ddt_ops[type]->ddt_op_remove(ddt->ddt_os,
- ddt->ddt_object[type][class], dde, tx));
-}
-
-int
-ddt_object_walk(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
- uint64_t *walk, ddt_entry_t *dde)
-{
- ASSERT(ddt_object_exists(ddt, type, class));
-
- return (ddt_ops[type]->ddt_op_walk(ddt->ddt_os,
- ddt->ddt_object[type][class], dde, walk));
-}
-
-int
-ddt_object_count(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
-    uint64_t *count)
-{
- ASSERT(ddt_object_exists(ddt, type, class));
-
- return (ddt_ops[type]->ddt_op_count(ddt->ddt_os,
- ddt->ddt_object[type][class], count));
-}
-
-int
-ddt_object_info(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
- dmu_object_info_t *doi)
-{
- if (!ddt_object_exists(ddt, type, class))
- return (SET_ERROR(ENOENT));
-
- return (dmu_object_info(ddt->ddt_os, ddt->ddt_object[type][class],
- doi));
-}
-
-boolean_t
-ddt_object_exists(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
-{
- return (!!ddt->ddt_object[type][class]);
-}
-
-void
-ddt_object_name(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
- char *name)
-{
- (void) sprintf(name, DMU_POOL_DDT,
- zio_checksum_table[ddt->ddt_checksum].ci_name,
- ddt_ops[type]->ddt_op_name, ddt_class_name[class]);
-}
-
-void
-ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg)
-{
- ASSERT(txg != 0);
-
- for (int d = 0; d < SPA_DVAS_PER_BP; d++)
- bp->blk_dva[d] = ddp->ddp_dva[d];
- BP_SET_BIRTH(bp, txg, ddp->ddp_phys_birth);
-}
-
-void
-ddt_bp_create(enum zio_checksum checksum,
- const ddt_key_t *ddk, const ddt_phys_t *ddp, blkptr_t *bp)
-{
- BP_ZERO(bp);
-
- if (ddp != NULL)
- ddt_bp_fill(ddp, bp, ddp->ddp_phys_birth);
-
- bp->blk_cksum = ddk->ddk_cksum;
- bp->blk_fill = 1;
-
- BP_SET_LSIZE(bp, DDK_GET_LSIZE(ddk));
- BP_SET_PSIZE(bp, DDK_GET_PSIZE(ddk));
- BP_SET_COMPRESS(bp, DDK_GET_COMPRESS(ddk));
- BP_SET_CHECKSUM(bp, checksum);
- BP_SET_TYPE(bp, DMU_OT_DEDUP);
- BP_SET_LEVEL(bp, 0);
- BP_SET_DEDUP(bp, 0);
- BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
-}
-
-void
-ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp)
-{
- ddk->ddk_cksum = bp->blk_cksum;
- ddk->ddk_prop = 0;
-
- DDK_SET_LSIZE(ddk, BP_GET_LSIZE(bp));
- DDK_SET_PSIZE(ddk, BP_GET_PSIZE(bp));
- DDK_SET_COMPRESS(ddk, BP_GET_COMPRESS(bp));
-}
-
-void
-ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp)
-{
- ASSERT(ddp->ddp_phys_birth == 0);
-
- for (int d = 0; d < SPA_DVAS_PER_BP; d++)
- ddp->ddp_dva[d] = bp->blk_dva[d];
- ddp->ddp_phys_birth = BP_PHYSICAL_BIRTH(bp);
-}
-
-void
-ddt_phys_clear(ddt_phys_t *ddp)
-{
- bzero(ddp, sizeof (*ddp));
-}
-
-void
-ddt_phys_addref(ddt_phys_t *ddp)
-{
- ddp->ddp_refcnt++;
-}
-
-void
-ddt_phys_decref(ddt_phys_t *ddp)
-{
- if (ddp) {
- ASSERT((int64_t)ddp->ddp_refcnt > 0);
- ddp->ddp_refcnt--;
- }
-}
-
-void
-ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, uint64_t txg)
-{
- blkptr_t blk;
-
- ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
- ddt_phys_clear(ddp);
- zio_free(ddt->ddt_spa, txg, &blk);
-}
-
-ddt_phys_t *
-ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp)
-{
- ddt_phys_t *ddp = (ddt_phys_t *)dde->dde_phys;
-
- for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
- if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_dva[0]) &&
- BP_PHYSICAL_BIRTH(bp) == ddp->ddp_phys_birth)
- return (ddp);
- }
- return (NULL);
-}
-
-uint64_t
-ddt_phys_total_refcnt(const ddt_entry_t *dde)
-{
- uint64_t refcnt = 0;
-
- for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++)
- refcnt += dde->dde_phys[p].ddp_refcnt;
-
- return (refcnt);
-}
-
-static void
-ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds)
-{
- spa_t *spa = ddt->ddt_spa;
- ddt_phys_t *ddp = dde->dde_phys;
- ddt_key_t *ddk = &dde->dde_key;
- uint64_t lsize = DDK_GET_LSIZE(ddk);
- uint64_t psize = DDK_GET_PSIZE(ddk);
-
- bzero(dds, sizeof (*dds));
-
- for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
- uint64_t dsize = 0;
- uint64_t refcnt = ddp->ddp_refcnt;
-
- if (ddp->ddp_phys_birth == 0)
- continue;
-
- for (int d = 0; d < SPA_DVAS_PER_BP; d++)
- dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]);
-
- dds->dds_blocks += 1;
- dds->dds_lsize += lsize;
- dds->dds_psize += psize;
- dds->dds_dsize += dsize;
-
- dds->dds_ref_blocks += refcnt;
- dds->dds_ref_lsize += lsize * refcnt;
- dds->dds_ref_psize += psize * refcnt;
- dds->dds_ref_dsize += dsize * refcnt;
- }
-}
-
-void
-ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg)
-{
- const uint64_t *s = (const uint64_t *)src;
- uint64_t *d = (uint64_t *)dst;
- uint64_t *d_end = (uint64_t *)(dst + 1);
-
- ASSERT(neg == 0 || neg == -1ULL); /* add or subtract */
-
- while (d < d_end)
- *d++ += (*s++ ^ neg) - neg;
-}
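The (*s ^ neg) - neg idiom above is branch-free conditional negation: with neg == 0 it is the identity, and with neg == -1ULL it computes ~x + 1, i.e. two's-complement negation, so one loop serves both addition and subtraction. A self-contained check:

	#include <stdio.h>
	#include <stdint.h>
	#include <inttypes.h>

	static uint64_t
	cond_negate(uint64_t x, uint64_t neg)
	{
		/* neg == 0 -> x; neg == -1ULL -> -x (mod 2^64). */
		return ((x ^ neg) - neg);
	}

	int
	main(void)
	{
		printf("%" PRIu64 " %" PRId64 "\n",
		    cond_negate(7, 0), (int64_t)cond_negate(7, -1ULL));
		/* prints: 7 -7 */
		return (0);
	}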
-
-static void
-ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg)
-{
- ddt_stat_t dds;
- ddt_histogram_t *ddh;
- int bucket;
-
- ddt_stat_generate(ddt, dde, &dds);
-
- bucket = highbit64(dds.dds_ref_blocks) - 1;
- ASSERT(bucket >= 0);
-
- ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class];
-
- ddt_stat_add(&ddh->ddh_stat[bucket], &dds, neg);
-}
-
-void
-ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src)
-{
- for (int h = 0; h < 64; h++)
- ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h], 0);
-}
-
-void
-ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh)
-{
- bzero(dds, sizeof (*dds));
-
- for (int h = 0; h < 64; h++)
- ddt_stat_add(dds, &ddh->ddh_stat[h], 0);
-}
-
-boolean_t
-ddt_histogram_empty(const ddt_histogram_t *ddh)
-{
- const uint64_t *s = (const uint64_t *)ddh;
- const uint64_t *s_end = (const uint64_t *)(ddh + 1);
-
- while (s < s_end)
- if (*s++ != 0)
- return (B_FALSE);
-
- return (B_TRUE);
-}
-
-void
-ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total)
-{
- /* Sum the statistics we cached in ddt_object_sync(). */
- for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
- ddt_t *ddt = spa->spa_ddt[c];
- for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
- for (enum ddt_class class = 0; class < DDT_CLASSES;
- class++) {
- ddt_object_t *ddo =
- &ddt->ddt_object_stats[type][class];
- ddo_total->ddo_count += ddo->ddo_count;
- ddo_total->ddo_dspace += ddo->ddo_dspace;
- ddo_total->ddo_mspace += ddo->ddo_mspace;
- }
- }
- }
-
- /* ... and compute the averages. */
- if (ddo_total->ddo_count != 0) {
- ddo_total->ddo_dspace /= ddo_total->ddo_count;
- ddo_total->ddo_mspace /= ddo_total->ddo_count;
- }
-}
-
-void
-ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh)
-{
- for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
- ddt_t *ddt = spa->spa_ddt[c];
- for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
- for (enum ddt_class class = 0; class < DDT_CLASSES;
- class++) {
- ddt_histogram_add(ddh,
- &ddt->ddt_histogram_cache[type][class]);
- }
- }
- }
-}
-
-void
-ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total)
-{
- ddt_histogram_t *ddh_total;
-
- ddh_total = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP);
- ddt_get_dedup_histogram(spa, ddh_total);
- ddt_histogram_stat(dds_total, ddh_total);
- kmem_free(ddh_total, sizeof (ddt_histogram_t));
-}
-
-uint64_t
-ddt_get_dedup_dspace(spa_t *spa)
-{
- ddt_stat_t dds_total = { 0 };
-
- ddt_get_dedup_stats(spa, &dds_total);
- return (dds_total.dds_ref_dsize - dds_total.dds_dsize);
-}
-
-uint64_t
-ddt_get_pool_dedup_ratio(spa_t *spa)
-{
- ddt_stat_t dds_total = { 0 };
-
- ddt_get_dedup_stats(spa, &dds_total);
- if (dds_total.dds_dsize == 0)
- return (100);
-
- return (dds_total.dds_ref_dsize * 100 / dds_total.dds_dsize);
-}
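For example, dds_ref_dsize == 300 (allocated size as referenced by all block pointers) against dds_dsize == 100 (allocated size actually stored) returns 300, which userland renders as a 3.00x dedup ratio.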
-
-int
-ddt_ditto_copies_needed(ddt_t *ddt, ddt_entry_t *dde, ddt_phys_t *ddp_willref)
-{
- spa_t *spa = ddt->ddt_spa;
- uint64_t total_refcnt = 0;
- uint64_t ditto = spa->spa_dedup_ditto;
- int total_copies = 0;
- int desired_copies = 0;
-
- for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
- ddt_phys_t *ddp = &dde->dde_phys[p];
- zio_t *zio = dde->dde_lead_zio[p];
- uint64_t refcnt = ddp->ddp_refcnt; /* committed refs */
- if (zio != NULL)
- refcnt += zio->io_parent_count; /* pending refs */
- if (ddp == ddp_willref)
- refcnt++; /* caller's ref */
- if (refcnt != 0) {
- total_refcnt += refcnt;
- total_copies += p;
- }
- }
-
- if (ditto == 0 || ditto > UINT32_MAX)
- ditto = UINT32_MAX;
-
- if (total_refcnt >= 1)
- desired_copies++;
- if (total_refcnt >= ditto)
- desired_copies++;
- if (total_refcnt >= ditto * ditto)
- desired_copies++;
-
- return (MAX(desired_copies, total_copies) - total_copies);
-}
-
-int
-ddt_ditto_copies_present(ddt_entry_t *dde)
-{
- ddt_phys_t *ddp = &dde->dde_phys[DDT_PHYS_DITTO];
- dva_t *dva = ddp->ddp_dva;
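-	/* Count the valid DVAs; for gang blocks the count starts at -1. */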
- int copies = 0 - DVA_GET_GANG(dva);
-
- for (int d = 0; d < SPA_DVAS_PER_BP; d++, dva++)
- if (DVA_IS_VALID(dva))
- copies++;
-
- ASSERT(copies >= 0 && copies < SPA_DVAS_PER_BP);
-
- return (copies);
-}
-
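-/*
- * Compress a DDT entry's payload for on-disk storage.  The first
- * output byte records the compression function used, with the host
- * byte order noted in DDT_COMPRESS_BYTEORDER_MASK; the payload is
- * ZLE-compressed, or stored raw when ZLE cannot shrink it.
- */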
-size_t
-ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len)
-{
- uchar_t *version = dst++;
- int cpfunc = ZIO_COMPRESS_ZLE;
- zio_compress_info_t *ci = &zio_compress_table[cpfunc];
- size_t c_len;
-
- ASSERT(d_len >= s_len + 1); /* no compression plus version byte */
-
- c_len = ci->ci_compress(src, dst, s_len, d_len - 1, ci->ci_level);
-
- if (c_len == s_len) {
- cpfunc = ZIO_COMPRESS_OFF;
- bcopy(src, dst, s_len);
- }
-
- *version = cpfunc;
- /* CONSTCOND */
- if (ZFS_HOST_BYTEORDER)
- *version |= DDT_COMPRESS_BYTEORDER_MASK;
-
- return (c_len + 1);
-}
-
-void
-ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len)
-{
- uchar_t version = *src++;
- int cpfunc = version & DDT_COMPRESS_FUNCTION_MASK;
- zio_compress_info_t *ci = &zio_compress_table[cpfunc];
-
- if (ci->ci_decompress != NULL)
- (void) ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level);
- else
- bcopy(src, dst, d_len);
-
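-	/* Byteswap if the entry was written with the opposite byte order. */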
- if (((version & DDT_COMPRESS_BYTEORDER_MASK) != 0) !=
- (ZFS_HOST_BYTEORDER != 0))
- byteswap_uint64_array(dst, d_len);
-}
-
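-/*
- * One DDT exists per checksum function; select it either by explicit
- * checksum or from a block pointer's checksum type.
- */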
-ddt_t *
-ddt_select_by_checksum(spa_t *spa, enum zio_checksum c)
-{
- return (spa->spa_ddt[c]);
-}
-
-ddt_t *
-ddt_select(spa_t *spa, const blkptr_t *bp)
-{
- return (spa->spa_ddt[BP_GET_CHECKSUM(bp)]);
-}
-
-void
-ddt_enter(ddt_t *ddt)
-{
- mutex_enter(&ddt->ddt_lock);
-}
-
-void
-ddt_exit(ddt_t *ddt)
-{
- mutex_exit(&ddt->ddt_lock);
-}
-
-static ddt_entry_t *
-ddt_alloc(const ddt_key_t *ddk)
-{
- ddt_entry_t *dde;
-
- dde = kmem_zalloc(sizeof (ddt_entry_t), KM_SLEEP);
- cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL);
-
- dde->dde_key = *ddk;
-
- return (dde);
-}
-
-static void
-ddt_free(ddt_entry_t *dde)
-{
- ASSERT(!dde->dde_loading);
-
- for (int p = 0; p < DDT_PHYS_TYPES; p++)
- ASSERT(dde->dde_lead_zio[p] == NULL);
-
- if (dde->dde_repair_abd != NULL)
- abd_free(dde->dde_repair_abd);
-
- cv_destroy(&dde->dde_cv);
- kmem_free(dde, sizeof (*dde));
-}
-
-void
-ddt_remove(ddt_t *ddt, ddt_entry_t *dde)
-{
- ASSERT(MUTEX_HELD(&ddt->ddt_lock));
-
- avl_remove(&ddt->ddt_tree, dde);
- ddt_free(dde);
-}
-
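-/*
- * Find or create the in-core entry for bp's dedup key.  The first
- * caller to look up a key drops ddt_lock and searches every on-disk
- * type/class object; concurrent lookups of the same key wait on
- * dde_cv until dde_loaded is set.
- */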
-ddt_entry_t *
-ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add)
-{
- ddt_entry_t *dde, dde_search;
- enum ddt_type type;
- enum ddt_class class;
- avl_index_t where;
- int error;
-
- ASSERT(MUTEX_HELD(&ddt->ddt_lock));
-
- ddt_key_fill(&dde_search.dde_key, bp);
-
- dde = avl_find(&ddt->ddt_tree, &dde_search, &where);
- if (dde == NULL) {
- if (!add)
- return (NULL);
- dde = ddt_alloc(&dde_search.dde_key);
- avl_insert(&ddt->ddt_tree, dde, where);
- }
-
- while (dde->dde_loading)
- cv_wait(&dde->dde_cv, &ddt->ddt_lock);
-
- if (dde->dde_loaded)
- return (dde);
-
- dde->dde_loading = B_TRUE;
-
- ddt_exit(ddt);
-
- error = ENOENT;
-
- for (type = 0; type < DDT_TYPES; type++) {
- for (class = 0; class < DDT_CLASSES; class++) {
- error = ddt_object_lookup(ddt, type, class, dde);
- if (error != ENOENT) {
- ASSERT0(error);
- break;
- }
- }
- if (error != ENOENT)
- break;
- }
-
- ddt_enter(ddt);
-
- ASSERT(dde->dde_loaded == B_FALSE);
- ASSERT(dde->dde_loading == B_TRUE);
-
- dde->dde_type = type; /* will be DDT_TYPES if no entry found */
- dde->dde_class = class; /* will be DDT_CLASSES if no entry found */
- dde->dde_loaded = B_TRUE;
- dde->dde_loading = B_FALSE;
-
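-	/*
-	 * neg == -1ULL makes ddt_stat_add() subtract: in two's
-	 * complement, (v ^ -1ULL) - (-1ULL) == -v.  The on-disk stats
-	 * are backed out here and re-added when the entry is synced.
-	 */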
- if (error == 0)
- ddt_stat_update(ddt, dde, -1ULL);
-
- cv_broadcast(&dde->dde_cv);
-
- return (dde);
-}
-
-void
-ddt_prefetch(spa_t *spa, const blkptr_t *bp)
-{
- ddt_t *ddt;
- ddt_entry_t dde;
-
- if (!zfs_dedup_prefetch || bp == NULL || !BP_GET_DEDUP(bp))
- return;
-
- /*
- * We only remove the DDT once all tables are empty and only
- * prefetch dedup blocks when there are entries in the DDT.
- * Thus no locking is required as the DDT can't disappear on us.
- */
- ddt = ddt_select(spa, bp);
- ddt_key_fill(&dde.dde_key, bp);
-
- for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
- for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
- ddt_object_prefetch(ddt, type, class, &dde);
- }
- }
-}
-
-/*
- * Overlay struct used to compare a ddt_key 16 bits at a time rather
- * than byte by byte; the resulting order is arbitrary but total,
- * which is all the AVL trees require.
- */
-#define DDT_KEY_CMP_LEN (sizeof (ddt_key_t) / sizeof (uint16_t))
-
-typedef struct ddt_key_cmp {
- uint16_t u16[DDT_KEY_CMP_LEN];
-} ddt_key_cmp_t;
-
-int
-ddt_entry_compare(const void *x1, const void *x2)
-{
- const ddt_entry_t *dde1 = x1;
- const ddt_entry_t *dde2 = x2;
- const ddt_key_cmp_t *k1 = (const ddt_key_cmp_t *)&dde1->dde_key;
- const ddt_key_cmp_t *k2 = (const ddt_key_cmp_t *)&dde2->dde_key;
- int32_t cmp = 0;
-
- for (int i = 0; i < DDT_KEY_CMP_LEN; i++) {
- cmp = (int32_t)k1->u16[i] - (int32_t)k2->u16[i];
- if (likely(cmp))
- break;
- }
-
- return (AVL_ISIGN(cmp));
-}
-
-static ddt_t *
-ddt_table_alloc(spa_t *spa, enum zio_checksum c)
-{
- ddt_t *ddt;
-
- ddt = kmem_zalloc(sizeof (*ddt), KM_SLEEP);
-
- mutex_init(&ddt->ddt_lock, NULL, MUTEX_DEFAULT, NULL);
- avl_create(&ddt->ddt_tree, ddt_entry_compare,
- sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
- avl_create(&ddt->ddt_repair_tree, ddt_entry_compare,
- sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
- ddt->ddt_checksum = c;
- ddt->ddt_spa = spa;
- ddt->ddt_os = spa->spa_meta_objset;
-
- return (ddt);
-}
-
-static void
-ddt_table_free(ddt_t *ddt)
-{
- ASSERT(avl_numnodes(&ddt->ddt_tree) == 0);
- ASSERT(avl_numnodes(&ddt->ddt_repair_tree) == 0);
- avl_destroy(&ddt->ddt_tree);
- avl_destroy(&ddt->ddt_repair_tree);
- mutex_destroy(&ddt->ddt_lock);
- kmem_free(ddt, sizeof (*ddt));
-}
-
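-/* Set the default dedup checksum and allocate a DDT per checksum function. */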
-void
-ddt_create(spa_t *spa)
-{
- spa->spa_dedup_checksum = ZIO_DEDUPCHECKSUM;
-
- for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++)
- spa->spa_ddt[c] = ddt_table_alloc(spa, c);
-}
-
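-/*
- * Called at pool open: look up the DDT statistics object in the MOS
- * pool directory (ENOENT simply means no DDT exists yet), then load
- * the on-disk state for each checksum's tables.
- */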
-int
-ddt_load(spa_t *spa)
-{
- int error;
-
- ddt_create(spa);
-
- error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_DDT_STATS, sizeof (uint64_t), 1,
- &spa->spa_ddt_stat_object);
-
- if (error)
- return (error == ENOENT ? 0 : error);
-
- for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
-